svn commit: r311355 - stable/11/sys/dev/hyperv/netvsc
Sepherosa Ziehau
sephe at FreeBSD.org
Thu Jan 5 03:25:39 UTC 2017
Author: sephe
Date: Thu Jan 5 03:25:38 2017
New Revision: 311355
URL: https://svnweb.freebsd.org/changeset/base/311355
Log:
MFC 308905
hyperv/hn: Implement RNDIS multi-packet message support.
Currently, it is only applied to packet sent through chimney sending
buffers. Not enabled by default yet.
This one gives 20%~30% performance boost for non-TSO usage in both
bit/packet rate tests and nginx performance test.
Sponsored by: Microsoft
Differential Revision: https://reviews.freebsd.org/D8560
Modified:
stable/11/sys/dev/hyperv/netvsc/hn_rndis.c
stable/11/sys/dev/hyperv/netvsc/if_hn.c
stable/11/sys/dev/hyperv/netvsc/if_hnvar.h
Directory Properties:
stable/11/ (props changed)
Modified: stable/11/sys/dev/hyperv/netvsc/hn_rndis.c
==============================================================================
--- stable/11/sys/dev/hyperv/netvsc/hn_rndis.c Thu Jan 5 03:25:16 2017 (r311354)
+++ stable/11/sys/dev/hyperv/netvsc/hn_rndis.c Thu Jan 5 03:25:38 2017 (r311355)
@@ -838,11 +838,15 @@ hn_rndis_init(struct hn_softc *sc)
error = EIO;
goto done;
}
+ sc->hn_rndis_agg_size = comp->rm_pktmaxsz;
+ sc->hn_rndis_agg_pkts = comp->rm_pktmaxcnt;
+ sc->hn_rndis_agg_align = 1U << comp->rm_align;
+
if (bootverbose) {
if_printf(sc->hn_ifp, "RNDIS ver %u.%u, pktsz %u, pktcnt %u, "
"align %u\n", comp->rm_ver_major, comp->rm_ver_minor,
- comp->rm_pktmaxsz, comp->rm_pktmaxcnt,
- 1U << comp->rm_align);
+ sc->hn_rndis_agg_size, sc->hn_rndis_agg_pkts,
+ sc->hn_rndis_agg_align);
}
error = 0;
done:
Modified: stable/11/sys/dev/hyperv/netvsc/if_hn.c
==============================================================================
--- stable/11/sys/dev/hyperv/netvsc/if_hn.c Thu Jan 5 03:25:16 2017 (r311354)
+++ stable/11/sys/dev/hyperv/netvsc/if_hn.c Thu Jan 5 03:25:38 2017 (r311355)
@@ -159,10 +159,22 @@ __FBSDID("$FreeBSD$");
#define HN_CSUM_IP6_HWASSIST(sc) \
((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
+#define HN_PKTSIZE_MIN(align) \
+ roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
+ HN_RNDIS_PKT_LEN, (align))
+#define HN_PKTSIZE(m, align) \
+ roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
+
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
SLIST_ENTRY(hn_txdesc) link;
#endif
+ STAILQ_ENTRY(hn_txdesc) agg_link;
+
+ /* Aggregated txdescs, in sending order. */
+ STAILQ_HEAD(, hn_txdesc) agg_list;
+
+ /* The oldest packet, if transmission aggregation happens. */
struct mbuf *m;
struct hn_tx_ring *txr;
int refs;
@@ -180,6 +192,7 @@ struct hn_txdesc {
#define HN_TXD_FLAG_ONLIST 0x0001
#define HN_TXD_FLAG_DMAMAP 0x0002
+#define HN_TXD_FLAG_ONAGG 0x0004
struct hn_rxinfo {
uint32_t vlan_info;
@@ -259,6 +272,10 @@ static int hn_rxfilter_sysctl(SYSCTL_H
static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static void hn_stop(struct hn_softc *);
static void hn_init_locked(struct hn_softc *);
@@ -306,7 +323,7 @@ static int hn_create_tx_data(struct hn
static void hn_fixup_tx_data(struct hn_softc *);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
-static int hn_encap(struct hn_tx_ring *,
+static int hn_encap(struct ifnet *, struct hn_tx_ring *,
struct hn_txdesc *, struct mbuf **);
static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
struct hn_txdesc *);
@@ -315,6 +332,10 @@ static void hn_set_tso_maxsize(struct
static bool hn_tx_ring_pending(struct hn_tx_ring *);
static void hn_tx_ring_qflush(struct hn_tx_ring *);
static void hn_resume_tx(struct hn_softc *, int);
+static void hn_set_txagg(struct hn_softc *);
+static void *hn_try_txagg(struct ifnet *,
+ struct hn_tx_ring *, struct hn_txdesc *,
+ int);
static int hn_get_txswq_depth(const struct hn_tx_ring *);
static void hn_txpkt_done(struct hn_nvs_sendctx *,
struct hn_softc *, struct vmbus_channel *,
@@ -430,6 +451,16 @@ SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_
&hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif
+/* Packet transmission aggregation size limit */
+static int hn_tx_agg_size = -1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
+ &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
+
+/* Packet transmission aggregation count limit */
+static int hn_tx_agg_pkts = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
+ &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
+
static u_int hn_cpu_index; /* next CPU for channel */
static struct taskqueue *hn_tx_taskq; /* shared TX taskqueue */
@@ -658,6 +689,84 @@ hn_set_rxfilter(struct hn_softc *sc)
return (error);
}
+static void
+hn_set_txagg(struct hn_softc *sc)
+{
+ uint32_t size, pkts;
+ int i;
+
+ /*
+ * Setup aggregation size.
+ */
+ if (sc->hn_agg_size < 0)
+ size = UINT32_MAX;
+ else
+ size = sc->hn_agg_size;
+
+ if (sc->hn_rndis_agg_size < size)
+ size = sc->hn_rndis_agg_size;
+
+ if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
+ /* Disable */
+ size = 0;
+ pkts = 0;
+ goto done;
+ }
+
+ /* NOTE: Type of the per TX ring setting is 'int'. */
+ if (size > INT_MAX)
+ size = INT_MAX;
+
+ /* NOTE: We only aggregate packets using chimney sending buffers. */
+ if (size > (uint32_t)sc->hn_chim_szmax)
+ size = sc->hn_chim_szmax;
+
+ /*
+ * Setup aggregation packet count.
+ */
+ if (sc->hn_agg_pkts < 0)
+ pkts = UINT32_MAX;
+ else
+ pkts = sc->hn_agg_pkts;
+
+ if (sc->hn_rndis_agg_pkts < pkts)
+ pkts = sc->hn_rndis_agg_pkts;
+
+ if (pkts <= 1) {
+ /* Disable */
+ size = 0;
+ pkts = 0;
+ goto done;
+ }
+
+ /* NOTE: Type of the per TX ring setting is 'short'. */
+ if (pkts > SHRT_MAX)
+ pkts = SHRT_MAX;
+
+done:
+ /* NOTE: Type of the per TX ring setting is 'short'. */
+ if (sc->hn_rndis_agg_align > SHRT_MAX) {
+ /* Disable */
+ size = 0;
+ pkts = 0;
+ }
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
+ size, pkts, sc->hn_rndis_agg_align);
+ }
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
+
+ mtx_lock(&txr->hn_tx_lock);
+ txr->hn_agg_szmax = size;
+ txr->hn_agg_pktmax = pkts;
+ txr->hn_agg_align = sc->hn_rndis_agg_align;
+ mtx_unlock(&txr->hn_tx_lock);
+ }
+}
+
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{
@@ -785,6 +894,12 @@ hn_attach(device_t dev)
HN_LOCK_INIT(sc);
/*
+ * Initialize these tunables once.
+ */
+ sc->hn_agg_size = hn_tx_agg_size;
+ sc->hn_agg_pkts = hn_tx_agg_pkts;
+
+ /*
* Setup taskqueue for transmission.
*/
if (hn_tx_taskq == NULL) {
@@ -939,6 +1054,24 @@ hn_attach(device_t dev)
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
hn_rss_ind_sysctl, "IU", "RSS indirect table");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
+ CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
+ "RNDIS offered packet transmission aggregation size limit");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
+ CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
+ "RNDIS offered packet transmission aggregation count limit");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
+ CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
+ "RNDIS packet transmission aggregation alignment");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_size_sysctl, "I",
+ "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_pkts_sysctl, "I",
+ "Packet transmission aggregation packets, "
+ "0 -- disable, -1 -- auto");
/*
* Setup the ifmedia, which has been initialized earlier.
@@ -1189,16 +1322,45 @@ hn_txdesc_put(struct hn_tx_ring *txr, st
KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
("put an onlist txd %#x", txd->flags));
+ KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+ ("put an onagg txd %#x", txd->flags));
KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
if (atomic_fetchadd_int(&txd->refs, -1) != 1)
return 0;
+ if (!STAILQ_EMPTY(&txd->agg_list)) {
+ struct hn_txdesc *tmp_txd;
+
+ while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
+ int freed;
+
+ KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
+ ("resursive aggregation on aggregated txdesc"));
+ KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
+ ("not aggregated txdesc"));
+ KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
+ ("aggregated txdesc uses dmamap"));
+ KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
+ ("aggregated txdesc consumes "
+ "chimney sending buffer"));
+ KASSERT(tmp_txd->chim_size == 0,
+ ("aggregated txdesc has non-zero "
+ "chimney sending size"));
+
+ STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
+ tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
+ freed = hn_txdesc_put(txr, tmp_txd);
+ KASSERT(freed, ("failed to free aggregated txdesc"));
+ }
+ }
+
if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
("chim txd uses dmamap"));
hn_chim_free(txr->hn_sc, txd->chim_index);
txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+ txd->chim_size = 0;
} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
bus_dmamap_sync(txr->hn_tx_data_dtag,
txd->data_dmap, BUS_DMASYNC_POSTWRITE);
@@ -1253,8 +1415,11 @@ hn_txdesc_get(struct hn_tx_ring *txr)
atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
KASSERT(txd->m == NULL && txd->refs == 0 &&
+ STAILQ_EMPTY(&txd->agg_list) &&
txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
+ txd->chim_size == 0 &&
(txd->flags & HN_TXD_FLAG_ONLIST) &&
+ (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
(txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
txd->flags &= ~HN_TXD_FLAG_ONLIST;
txd->refs = 1;
@@ -1271,6 +1436,22 @@ hn_txdesc_hold(struct hn_txdesc *txd)
atomic_add_int(&txd->refs, 1);
}
+static __inline void
+hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
+{
+
+ KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+ ("recursive aggregation on aggregating txdesc"));
+
+ KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+ ("already aggregated"));
+ KASSERT(STAILQ_EMPTY(&txd->agg_list),
+ ("recursive aggregation on to-be-aggregated txdesc"));
+
+ txd->flags |= HN_TXD_FLAG_ONAGG;
+ STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
+}
+
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
@@ -1382,12 +1563,123 @@ hn_rndis_pktinfo_append(struct rndis_pac
return (pi->rm_data);
}
+static __inline int
+hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
+{
+ struct hn_txdesc *txd;
+ struct mbuf *m;
+ int error, pkts;
+
+ txd = txr->hn_agg_txd;
+ KASSERT(txd != NULL, ("no aggregate txdesc"));
+
+ /*
+ * Since hn_txpkt() will reset this temporary stat, save
+ * it now, so that oerrors can be updated properly, if
+ * hn_txpkt() ever fails.
+ */
+ pkts = txr->hn_stat_pkts;
+
+ /*
+ * Since txd's mbuf will _not_ be freed upon hn_txpkt()
+ * failure, save it for later freeing, if hn_txpkt() ever
+ * fails.
+ */
+ m = txd->m;
+ error = hn_txpkt(ifp, txr, txd);
+ if (__predict_false(error)) {
+ /* txd is freed, but m is not. */
+ m_freem(m);
+
+ txr->hn_flush_failed++;
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
+ }
+
+ /* Reset all aggregation states. */
+ txr->hn_agg_txd = NULL;
+ txr->hn_agg_szleft = 0;
+ txr->hn_agg_pktleft = 0;
+ txr->hn_agg_prevpkt = NULL;
+
+ return (error);
+}
+
+static void *
+hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+ int pktsize)
+{
+ void *chim;
+
+ if (txr->hn_agg_txd != NULL) {
+ if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
+ struct hn_txdesc *agg_txd = txr->hn_agg_txd;
+ struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
+ int olen;
+
+ /*
+ * Update the previous RNDIS packet's total length,
+ * it can be increased due to the mandatory alignment
+ * padding for this RNDIS packet. And update the
+ * aggregating txdesc's chimney sending buffer size
+ * accordingly.
+ *
+ * XXX
+ * Zero-out the padding, as required by the RNDIS spec.
+ */
+ olen = pkt->rm_len;
+ pkt->rm_len = roundup2(olen, txr->hn_agg_align);
+ agg_txd->chim_size += pkt->rm_len - olen;
+
+ /* Link this txdesc to the parent. */
+ hn_txdesc_agg(agg_txd, txd);
+
+ chim = (uint8_t *)pkt + pkt->rm_len;
+ /* Save the current packet for later fixup. */
+ txr->hn_agg_prevpkt = chim;
+
+ txr->hn_agg_pktleft--;
+ txr->hn_agg_szleft -= pktsize;
+ if (txr->hn_agg_szleft <=
+ HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+ /*
+ * Probably can't aggregate more packets,
+ * flush this aggregating txdesc proactively.
+ */
+ txr->hn_agg_pktleft = 0;
+ }
+ /* Done! */
+ return (chim);
+ }
+ hn_flush_txagg(ifp, txr);
+ }
+ KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+ txr->hn_tx_chimney_tried++;
+ txd->chim_index = hn_chim_alloc(txr->hn_sc);
+ if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
+ return (NULL);
+ txr->hn_tx_chimney++;
+
+ chim = txr->hn_sc->hn_chim +
+ (txd->chim_index * txr->hn_sc->hn_chim_szmax);
+
+ if (txr->hn_agg_pktmax > 1 &&
+ txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+ txr->hn_agg_txd = txd;
+ txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
+ txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
+ txr->hn_agg_prevpkt = chim;
+ }
+ return (chim);
+}
+
/*
* NOTE:
* If this function fails, then both txd and m_head0 will be freed.
*/
static int
-hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
+hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+ struct mbuf **m_head0)
{
bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
int error, nsegs, i;
@@ -1395,33 +1687,30 @@ hn_encap(struct hn_tx_ring *txr, struct
struct rndis_packet_msg *pkt;
uint32_t *pi_data;
void *chim = NULL;
- int pktlen;
+ int pkt_hlen, pkt_size;
pkt = txd->rndis_pkt;
- if (m_head->m_pkthdr.len + HN_RNDIS_PKT_LEN < txr->hn_chim_size) {
- /*
- * This packet is small enough to fit into a chimney sending
- * buffer. Try allocating one chimney sending buffer now.
- */
- txr->hn_tx_chimney_tried++;
- txd->chim_index = hn_chim_alloc(txr->hn_sc);
- if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
- chim = txr->hn_sc->hn_chim +
- (txd->chim_index * txr->hn_sc->hn_chim_szmax);
- /*
- * Directly fill the chimney sending buffer w/ the
- * RNDIS packet message.
- */
+ pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
+ if (pkt_size < txr->hn_chim_size) {
+ chim = hn_try_txagg(ifp, txr, txd, pkt_size);
+ if (chim != NULL)
pkt = chim;
- }
+ } else {
+ if (txr->hn_agg_txd != NULL)
+ hn_flush_txagg(ifp, txr);
}
pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
pkt->rm_dataoffset = sizeof(*pkt);
pkt->rm_datalen = m_head->m_pkthdr.len;
+ pkt->rm_oobdataoffset = 0;
+ pkt->rm_oobdatalen = 0;
+ pkt->rm_oobdataelements = 0;
pkt->rm_pktinfooffset = sizeof(*pkt);
pkt->rm_pktinfolen = 0;
+ pkt->rm_vchandle = 0;
+ pkt->rm_reserved = 0;
if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
/*
@@ -1482,7 +1771,7 @@ hn_encap(struct hn_tx_ring *txr, struct
*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
}
- pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
+ pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
/* Convert RNDIS packet message offsets */
pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
@@ -1491,25 +1780,36 @@ hn_encap(struct hn_tx_ring *txr, struct
* Fast path: Chimney sending.
*/
if (chim != NULL) {
- KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
- ("chimney buffer is not used"));
- KASSERT(pkt == chim, ("RNDIS pkt not in chimney buffer"));
+ struct hn_txdesc *tgt_txd = txd;
+
+ if (txr->hn_agg_txd != NULL) {
+ tgt_txd = txr->hn_agg_txd;
+#ifdef INVARIANTS
+ *m_head0 = NULL;
+#endif
+ }
+
+ KASSERT(pkt == chim,
+ ("RNDIS pkt not in chimney sending buffer"));
+ KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
+ ("chimney sending buffer is not used"));
+ tgt_txd->chim_size += pkt->rm_len;
m_copydata(m_head, 0, m_head->m_pkthdr.len,
- ((uint8_t *)chim) + pktlen);
+ ((uint8_t *)chim) + pkt_hlen);
- txd->chim_size = pkt->rm_len;
txr->hn_gpa_cnt = 0;
- txr->hn_tx_chimney++;
txr->hn_sendpkt = hn_txpkt_chim;
goto done;
}
+
+ KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
("chimney buffer is used"));
KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
- if (error) {
+ if (__predict_false(error)) {
int freed;
/*
@@ -1523,7 +1823,7 @@ hn_encap(struct hn_tx_ring *txr, struct
("fail to free txd upon txdma error"));
txr->hn_txdma_failed++;
- if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return error;
}
*m_head0 = m_head;
@@ -1534,7 +1834,7 @@ hn_encap(struct hn_tx_ring *txr, struct
/* send packet with page buffer */
txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
- txr->hn_gpa[0].gpa_len = pktlen;
+ txr->hn_gpa[0].gpa_len = pkt_hlen;
/*
* Fill the page buffers with mbuf info after the page
@@ -1557,6 +1857,12 @@ done:
/* Set the completion routine */
hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
+ /* Update temporary stats for later use. */
+ txr->hn_stat_pkts++;
+ txr->hn_stat_size += m_head->m_pkthdr.len;
+ if (m_head->m_flags & M_MCAST)
+ txr->hn_stat_mcasts++;
+
return 0;
}
@@ -1572,23 +1878,34 @@ hn_txpkt(struct ifnet *ifp, struct hn_tx
again:
/*
- * Make sure that txd is not freed before ETHER_BPF_MTAP.
+ * Make sure that this txd and any aggregated txds are not freed
+ * before ETHER_BPF_MTAP.
*/
hn_txdesc_hold(txd);
error = txr->hn_sendpkt(txr, txd);
if (!error) {
- ETHER_BPF_MTAP(ifp, txd->m);
- if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ if (bpf_peers_present(ifp->if_bpf)) {
+ const struct hn_txdesc *tmp_txd;
+
+ ETHER_BPF_MTAP(ifp, txd->m);
+ STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
+ ETHER_BPF_MTAP(ifp, tmp_txd->m);
+ }
+
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
if (!hn_use_if_start)
#endif
{
if_inc_counter(ifp, IFCOUNTER_OBYTES,
- txd->m->m_pkthdr.len);
- if (txd->m->m_flags & M_MCAST)
- if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
+ txr->hn_stat_size);
+ if (txr->hn_stat_mcasts != 0) {
+ if_inc_counter(ifp, IFCOUNTER_OMCASTS,
+ txr->hn_stat_mcasts);
+ }
}
- txr->hn_pkts++;
+ txr->hn_pkts += txr->hn_stat_pkts;
+ txr->hn_sends++;
}
hn_txdesc_put(txr, txd);
@@ -1628,7 +1945,13 @@ again:
txr->hn_send_failed++;
}
- return error;
+
+ /* Reset temporary stats, after this sending is done. */
+ txr->hn_stat_size = 0;
+ txr->hn_stat_pkts = 0;
+ txr->hn_stat_mcasts = 0;
+
+ return (error);
}
/*
@@ -2412,6 +2735,64 @@ hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARG
}
static int
+hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error, size;
+
+ size = sc->hn_agg_size;
+ error = sysctl_handle_int(oidp, &size, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ HN_LOCK(sc);
+ sc->hn_agg_size = size;
+ hn_set_txagg(sc);
+ HN_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error, pkts;
+
+ pkts = sc->hn_agg_pkts;
+ error = sysctl_handle_int(oidp, &pkts, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ HN_LOCK(sc);
+ sc->hn_agg_pkts = pkts;
+ hn_set_txagg(sc);
+ HN_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int pkts;
+
+ pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
+ return (sysctl_handle_int(oidp, &pkts, 0, req));
+}
+
+static int
+hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int align;
+
+ align = sc->hn_tx_ring[0].hn_agg_align;
+ return (sysctl_handle_int(oidp, &align, 0, req));
+}
+
+static int
hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
{
struct hn_softc *sc = arg1;
@@ -2954,6 +3335,7 @@ hn_tx_ring_create(struct hn_softc *sc, i
txd->txr = txr;
txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+ STAILQ_INIT(&txd->agg_list);
/*
* Allocate and load RNDIS packet message.
@@ -3037,6 +3419,8 @@ hn_tx_ring_create(struct hn_softc *sc, i
SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
CTLFLAG_RW, &txr->hn_pkts,
"# of packets transmitted");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
+ CTLFLAG_RW, &txr->hn_sends, "# of sends");
}
}
@@ -3151,6 +3535,11 @@ hn_create_tx_data(struct hn_softc *sc, i
CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
__offsetof(struct hn_tx_ring, hn_txdma_failed),
hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_flush_failed),
+ hn_tx_stat_ulong_sysctl, "LU",
+ "# of packet transmission aggregation flush failure");
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
__offsetof(struct hn_tx_ring, hn_tx_collapsed),
@@ -3187,6 +3576,17 @@ hn_create_tx_data(struct hn_softc *sc, i
CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
+ CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
+ "Applied packet transmission aggregation size");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_pktmax_sysctl, "I",
+ "Applied packet transmission aggregation packets");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_align_sysctl, "I",
+ "Applied packet transmission aggregation alignment");
return 0;
}
@@ -3306,18 +3706,20 @@ hn_start_locked(struct hn_tx_ring *txr,
{
struct hn_softc *sc = txr->hn_sc;
struct ifnet *ifp = sc->hn_ifp;
+ int sched = 0;
KASSERT(hn_use_if_start,
("hn_start_locked is called, when if_start is disabled"));
KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+ KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
if (__predict_false(txr->hn_suspended))
- return 0;
+ return (0);
if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
IFF_DRV_RUNNING)
- return 0;
+ return (0);
while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
struct hn_txdesc *txd;
@@ -3335,7 +3737,8 @@ hn_start_locked(struct hn_tx_ring *txr,
* following up packets) to tx taskqueue.
*/
IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
- return 1;
+ sched = 1;
+ break;
}
#if defined(INET6) || defined(INET)
@@ -3356,21 +3759,50 @@ hn_start_locked(struct hn_tx_ring *txr,
break;
}
- error = hn_encap(txr, txd, &m_head);
+ error = hn_encap(ifp, txr, txd, &m_head);
if (error) {
/* Both txd and m_head are freed */
+ KASSERT(txr->hn_agg_txd == NULL,
+ ("encap failed w/ pending aggregating txdesc"));
continue;
}
- error = hn_txpkt(ifp, txr, txd);
- if (__predict_false(error)) {
- /* txd is freed, but m_head is not */
- IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
- atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
- break;
+ if (txr->hn_agg_pktleft == 0) {
+ if (txr->hn_agg_txd != NULL) {
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
+ error = hn_flush_txagg(ifp, txr);
+ if (__predict_false(error)) {
+ atomic_set_int(&ifp->if_drv_flags,
+ IFF_DRV_OACTIVE);
+ break;
+ }
+ } else {
+ KASSERT(m_head != NULL, ("mbuf was freed"));
+ error = hn_txpkt(ifp, txr, txd);
+ if (__predict_false(error)) {
+ /* txd is freed, but m_head is not */
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+ atomic_set_int(&ifp->if_drv_flags,
+ IFF_DRV_OACTIVE);
+ break;
+ }
+ }
+ }
+#ifdef INVARIANTS
+ else {
+ KASSERT(txr->hn_agg_txd != NULL,
+ ("no aggregating txdesc"));
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
}
+#endif
}
- return 0;
+
+ /* Flush pending aggerated transmission. */
+ if (txr->hn_agg_txd != NULL)
+ hn_flush_txagg(ifp, txr);
+ return (sched);
}
static void
@@ -3447,18 +3879,20 @@ hn_xmit(struct hn_tx_ring *txr, int len)
struct hn_softc *sc = txr->hn_sc;
struct ifnet *ifp = sc->hn_ifp;
struct mbuf *m_head;
+ int sched = 0;
mtx_assert(&txr->hn_tx_lock, MA_OWNED);
#ifdef HN_IFSTART_SUPPORT
KASSERT(hn_use_if_start == 0,
("hn_xmit is called, when if_start is enabled"));
#endif
+ KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
if (__predict_false(txr->hn_suspended))
- return 0;
+ return (0);
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
- return 0;
+ return (0);
while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
struct hn_txdesc *txd;
@@ -3471,7 +3905,8 @@ hn_xmit(struct hn_tx_ring *txr, int len)
* following up packets) to tx taskqueue.
*/
drbr_putback(ifp, txr->hn_mbuf_br, m_head);
- return 1;
+ sched = 1;
+ break;
}
txd = hn_txdesc_get(txr);
@@ -3482,25 +3917,53 @@ hn_xmit(struct hn_tx_ring *txr, int len)
break;
}
- error = hn_encap(txr, txd, &m_head);
+ error = hn_encap(ifp, txr, txd, &m_head);
if (error) {
/* Both txd and m_head are freed; discard */
+ KASSERT(txr->hn_agg_txd == NULL,
+ ("encap failed w/ pending aggregating txdesc"));
drbr_advance(ifp, txr->hn_mbuf_br);
continue;
}
- error = hn_txpkt(ifp, txr, txd);
- if (__predict_false(error)) {
- /* txd is freed, but m_head is not */
- drbr_putback(ifp, txr->hn_mbuf_br, m_head);
- txr->hn_oactive = 1;
- break;
+ if (txr->hn_agg_pktleft == 0) {
+ if (txr->hn_agg_txd != NULL) {
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
+ error = hn_flush_txagg(ifp, txr);
+ if (__predict_false(error)) {
+ txr->hn_oactive = 1;
+ break;
+ }
+ } else {
+ KASSERT(m_head != NULL, ("mbuf was freed"));
+ error = hn_txpkt(ifp, txr, txd);
+ if (__predict_false(error)) {
+ /* txd is freed, but m_head is not */
+ drbr_putback(ifp, txr->hn_mbuf_br,
+ m_head);
+ txr->hn_oactive = 1;
+ break;
+ }
+ }
}
+#ifdef INVARIANTS
+ else {
+ KASSERT(txr->hn_agg_txd != NULL,
+ ("no aggregating txdesc"));
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
+ }
+#endif
/* Sent */
drbr_advance(ifp, txr->hn_mbuf_br);
}
- return 0;
+
+ /* Flush pending aggerated transmission. */
+ if (txr->hn_agg_txd != NULL)
+ hn_flush_txagg(ifp, txr);
+ return (sched);
}
static int
@@ -3978,6 +4441,11 @@ back:
if (error)
return (error);
+ /*
+ * Fixup transmission aggregation setup.
+ */
+ hn_set_txagg(sc);
+
sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
return (0);
}
Modified: stable/11/sys/dev/hyperv/netvsc/if_hnvar.h
==============================================================================
--- stable/11/sys/dev/hyperv/netvsc/if_hnvar.h Thu Jan 5 03:25:16 2017 (r311354)
+++ stable/11/sys/dev/hyperv/netvsc/if_hnvar.h Thu Jan 5 03:25:38 2017 (r311355)
@@ -125,6 +125,22 @@ struct hn_tx_ring {
bus_dma_tag_t hn_tx_data_dtag;
uint64_t hn_csum_assist;
+ /* Applied packet transmission aggregation limits. */
+ int hn_agg_szmax;
+ short hn_agg_pktmax;
+ short hn_agg_align;
+
+ /* Packet transmission aggregation states. */
+ struct hn_txdesc *hn_agg_txd;
+ int hn_agg_szleft;
+ short hn_agg_pktleft;
+ struct rndis_packet_msg *hn_agg_prevpkt;
+
+ /* Temporary stats for each sends. */
+ int hn_stat_size;
+ short hn_stat_pkts;
+ short hn_stat_mcasts;
+
int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *);
int hn_suspended;
int hn_gpa_cnt;
@@ -137,6 +153,8 @@ struct hn_tx_ring {
u_long hn_tx_chimney_tried;
u_long hn_tx_chimney;
u_long hn_pkts;
+ u_long hn_sends;
+ u_long hn_flush_failed;
/* Rarely used stuffs */
struct hn_txdesc *hn_txdesc;
@@ -180,6 +198,10 @@ struct hn_softc {
uint32_t hn_nvs_ver;
uint32_t hn_rx_filter;
+ /* Packet transmission aggregation user settings. */
+ int hn_agg_size;
+ int hn_agg_pkts;
+
struct taskqueue *hn_mgmt_taskq;
struct taskqueue *hn_mgmt_taskq0;
struct task hn_link_task;
@@ -200,6 +222,9 @@ struct hn_softc {
uint32_t hn_ndis_ver;
int hn_ndis_tso_szmax;
int hn_ndis_tso_sgmin;
+ uint32_t hn_rndis_agg_size;
+ uint32_t hn_rndis_agg_pkts;
+ uint32_t hn_rndis_agg_align;
int hn_rss_ind_size;
uint32_t hn_rss_hash; /* NDIS_HASH_ */
More information about the svn-src-stable
mailing list