git: e398922eaf66 - main - Enable M_TSTMP in Chelsio cxgbe driver by creating a mechanism that can sync the time.
Date: Tue, 20 Sep 2022 20:27:04 UTC
The branch main has been updated by rrs: URL: https://cgit.FreeBSD.org/src/commit/?id=e398922eaf66978b5e556f6b4b095693c865f329 commit e398922eaf66978b5e556f6b4b095693c865f329 Author: Randall Stewart <rrs@FreeBSD.org> AuthorDate: 2022-09-20 19:13:16 +0000 Commit: Randall Stewart <rrs@FreeBSD.org> CommitDate: 2022-09-20 19:13:16 +0000 Enable M_TSTMP in Chelsio cxgbe driver by creating a mechanism that can sync the time. Chelsio has always been recording a timestamp in the mbuf (rcv_tstmp) but not setting the M_TSTMP bit in the mbuf flags. This is because the timestamp was just the free-running 60-bit clock. This change fixes that by keeping the clock synchronized: periodically (every 30 seconds after startup) we get the timestamp and the current nanosecond time. We always keep several sets around, and in the current one we always keep the current pair and the previous pair of timestamps. This allows us to set up a ratio between the two so we can correctly translate the time. Note that we take special care to split the timestamp into seconds (per the clock tick) and nanoseconds; otherwise 64-bit math would overflow. Reviewed by: np Sponsored by: Netflix Inc Differential Revision: https://reviews.freebsd.org/D36315 --- sys/dev/cxgbe/adapter.h | 14 +++++++ sys/dev/cxgbe/t4_main.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++- sys/dev/cxgbe/t4_sge.c | 86 +++++++++++++++++++++++++++++++++-------- 3 files changed, 182 insertions(+), 18 deletions(-) diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index f002f77fdd31..4080f04246c2 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -865,6 +865,15 @@ struct devnames { struct clip_entry; +#define CNT_CAL_INFO 3 +struct clock_sync { + uint64_t hw_cur; + uint64_t hw_prev; + uint64_t rt_cur; + uint64_t rt_prev; + uint32_t gen; +}; + struct adapter { SLIST_ENTRY(adapter) link; device_t dev; @@ -984,6 +993,11 @@ struct adapter { struct mtx sfl_lock; /* same cache-line as sc_lock? 
but that's ok */ TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; + struct callout cal_callout; + struct clock_sync cal_info[CNT_CAL_INFO]; + int cal_current; + int cal_count; + uint32_t cal_gen; /* * Driver code that can run when the adapter is suspended must use this diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 47caab160d29..e3236dab0ed0 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -320,6 +320,18 @@ static int t4_nofldtxq = -NOFLDTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0, "Number of offload TX queues per port"); +static int t4_clocksync_fast = 1; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, csfast, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_fast, 0, + "During initial clock sync how fast do we update in seconds"); + +static int t4_clocksync_normal = 30; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, csnormal, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_normal, 0, + "During normal clock sync how fast do we update in seconds"); + +static int t4_fast_2_normal = 30; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, cscount, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_fast_2_normal, 0, + "How many clock syncs do we need to do to transition to slow"); + #define NOFLDRXQ 2 static int t4_nofldrxq = -NOFLDRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0, @@ -1109,6 +1121,79 @@ t4_ifnet_unit(struct adapter *sc, struct port_info *pi) return (-1); } +static inline uint64_t +t4_get_ns_timestamp(struct timespec *ts) +{ + return ((ts->tv_sec * 1000000000) + ts->tv_nsec); +} + +static void +t4_calibration(void *arg) +{ + struct adapter *sc; + struct timespec ts; + struct clock_sync *cur, *nex; + int next_up; + + sc = (struct adapter *)arg; + + cur = &sc->cal_info[sc->cal_current]; + next_up = (sc->cal_current + 1) % CNT_CAL_INFO; + nex = &sc->cal_info[next_up]; + if (__predict_false(sc->cal_count == 0)) { + /* First time in, just get the values in */ + cur->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO); + 
nanouptime(&ts); + cur->rt_cur = t4_get_ns_timestamp(&ts); + sc->cal_count++; + goto done; + } + nex->hw_prev = cur->hw_cur; + nex->rt_prev = cur->rt_cur; + KASSERT((hw_off_limits(sc) == 0), "hw_off_limits at t4_calibtration"); + nex->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO); + nanouptime(&ts); + nex->rt_cur = t4_get_ns_timestamp(&ts); + if ((nex->hw_cur - nex->hw_prev) == 0) { + /* The clock is not advancing? */ + sc->cal_count = 0; + atomic_store_rel_int(&cur->gen, 0); + goto done; + } + atomic_store_rel_int(&cur->gen, 0); + sc->cal_current = next_up; + sc->cal_gen++; + atomic_store_rel_int(&nex->gen, sc->cal_gen); + if (sc->cal_count < t4_fast_2_normal) + sc->cal_count++; +done: + callout_reset_sbt_curcpu(&sc->cal_callout, + ((sc->cal_count < t4_fast_2_normal) ? + t4_clocksync_fast : t4_clocksync_normal) * SBT_1S, 0, + t4_calibration, sc, C_DIRECT_EXEC); +} + + + +static void +t4_calibration_start(struct adapter *sc) +{ + /* + * Here if we have not done a calibration + * then do so otherwise start the appropriate + * timer. + */ + int i; + + for (i = 0; i < CNT_CAL_INFO; i++) { + sc->cal_info[i].gen = 0; + } + sc->cal_current = 0; + sc->cal_count = 0; + sc->cal_gen = 0; + t4_calibration(sc); +} + static int t4_attach(device_t dev) { @@ -1177,6 +1262,8 @@ t4_attach(device_t dev) callout_init(&sc->ktls_tick, 1); + callout_init(&sc->cal_callout, 1); + refcount_init(&sc->vxlan_refcount, 0); TASK_INIT(&sc->reset_task, 0, reset_adapter_task, sc); @@ -1567,6 +1654,7 @@ t4_attach(device_t dev) "failed to attach all child ports: %d\n", rc); goto done; } + t4_calibration_start(sc); device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", @@ -1742,7 +1830,8 @@ t4_detach_common(device_t dev) free(pi, M_CXGBE); } } - + callout_stop(&sc->cal_callout); + callout_drain(&sc->cal_callout); device_delete_children(dev); sysctl_ctx_free(&sc->ctx); adapter_full_uninit(sc); @@ -1920,7 +2009,6 @@ t4_suspend(device_t dev) /* No more DMA or interrupts. 
*/ stop_adapter(sc); - /* Quiesce all activity. */ for_each_port(sc, i) { pi = sc->port[i]; @@ -1993,6 +2081,10 @@ t4_suspend(device_t dev) quiesce_iq_fl(sc, &sc->sge.fwq, NULL); } + /* Stop calibration */ + callout_stop(&sc->cal_callout); + callout_drain(&sc->cal_callout); + /* Mark the adapter totally off limits. */ mtx_lock(&sc->reg_lock); atomic_set_int(&sc->error_flags, HW_OFF_LIMITS); @@ -2359,6 +2451,10 @@ t4_resume(device_t dev) } } } + + /* Reset all calibration */ + t4_calibration_start(sc); + done: if (rc == 0) { sc->incarnation++; diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 0ee8d709eca4..0dbd4e92a684 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -1520,15 +1520,73 @@ sort_before_lro(struct lro_ctrl *lro) } #endif +#define CGBE_SHIFT_SCALE 10 + static inline uint64_t -last_flit_to_ns(struct adapter *sc, uint64_t lf) +t4_tstmp_to_ns(struct adapter *sc, uint64_t lf) { - uint64_t n = be64toh(lf) & 0xfffffffffffffff; /* 60b, not 64b. */ + struct clock_sync *cur, dcur; + uint64_t tstmp_sec, tstmp_nsec; + uint64_t hw_clocks; + uint64_t rt_cur_to_prev, res_s, res_n, res_s_modulo, res; + uint64_t hw_clk_div, cclk; + uint64_t hw_tstmp = lf & 0xfffffffffffffffULL; /* 60b, not 64b. */ + uint32_t gen; - if (n > UINT64_MAX / 1000000) - return (n / sc->params.vpd.cclk * 1000000); - else - return (n * 1000000 / sc->params.vpd.cclk); + do { + cur = &sc->cal_info[sc->cal_current]; + gen = atomic_load_acq_int(&cur->gen); + if (gen == 0) + return (0); + dcur = *cur; + atomic_thread_fence_acq(); + } while (gen != dcur.gen); + + /* + * Our goal here is to have a result that is: + * + * ( (cur_time - prev_time) ) + * ((hw_tstmp - hw_prev) * ----------------------------- ) + prev_time + * ( (hw_cur - hw_prev) ) + * + * With the constraints that we cannot use float and we + * don't want to overflow the uint64_t numbers we are using. 
+ * + * The plan is to take the clocking value of the hw timestamps + * and split them into seconds and nanosecond equivalent portions. + * Then we operate on the two portions separately making sure to + * bring back the carry over from the seconds when we divide. + * + * First up let's get the two divided into separate entities + * i.e. the seconds. We use the clock frequency for this. + * Note that vpd.cclk is in khz, we need it in raw hz so + * convert to hz. + */ + cclk = sc->params.vpd.cclk * 1000; + hw_clocks = hw_tstmp - dcur.hw_prev; + tstmp_sec = hw_clocks / cclk; + tstmp_nsec = hw_clocks % cclk; + /* Now work with them separately */ + rt_cur_to_prev = (dcur.rt_cur - dcur.rt_prev); + res_s = tstmp_sec * rt_cur_to_prev; + res_n = tstmp_nsec * rt_cur_to_prev; + /* Now let's get our divider */ + hw_clk_div = dcur.hw_cur - dcur.hw_prev; + /* Make sure to save the remainder from the seconds divide */ + res_s_modulo = res_s % hw_clk_div; + res_s /= hw_clk_div; + /* scale the remainder to where it should be */ + res_s_modulo *= cclk; + /* Now add in the remainder */ + res_n += res_s_modulo; + /* Now do the divide */ + res_n /= hw_clk_div; + res_s *= cclk; + /* Recombine the two */ + res = res_s + res_n; + /* And now add in the base time to get to the real timestamp */ + res += dcur.rt_prev; + return (res); } static inline void @@ -2077,17 +2135,13 @@ have_mbuf: if (rxq->iq.flags & IQ_RX_TIMESTAMP) { /* - * Fill up rcv_tstmp but do not set M_TSTMP. - * rcv_tstmp is not in the format that the - * kernel expects and we don't want to mislead - * it. For now this is only for custom code - * that knows how to interpret cxgbe's stamp. + * Fill up rcv_tstmp and set M_TSTMP only when + * t4_tstmp_to_ns() returns a non-zero timestamp. 
*/ - m0->m_pkthdr.rcv_tstmp = - last_flit_to_ns(sc, d->rsp.u.last_flit); -#ifdef notyet - m0->m_flags |= M_TSTMP; -#endif + m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc, + be64toh(d->rsp.u.last_flit)); + if (m0->m_pkthdr.rcv_tstmp != 0) + m0->m_flags |= M_TSTMP; } #ifdef NUMA