svn commit: r265582 - in stable/9/sys/dev/cxgbe: . common tom
From: Navdeep Parhar <np at FreeBSD.org>
Date: Wed May 7 18:15:21 UTC 2014
Author: np
Date: Wed May 7 18:15:20 2014
New Revision: 265582
URL: http://svnweb.freebsd.org/changeset/base/265582
Log:
MFC r259527, r260210 (by adrian@), r261533, r261536, r261537, r261558 (by
scottl@), r263317, r263412, r263457, and r264621 (by emax@).
r259527:
Do not create a hardware IPv6 server if the listen address is not
in6addr_any and is not in the CLIP table either. This fixes a reported
TOE+IPv6 NULL-dereference panic in do_pass_open_rpl().
While here, stop creating hardware servers for any loopback address.
It's just a waste of server tids.
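A minimal sketch of the resulting rule, using an illustrative helper
(addr_in_clip_table() is not a real function; the actual checks live in
tom/t4_listen.c, which this commit modifies):
	/*
	 * Sketch only: should a hardware (TOE) listener be created for
	 * this IPv6 listen address?  addr_in_clip_table() stands in for
	 * the driver's CLIP table lookup.
	 */
	static int
	should_create_hw_server6(struct adapter *sc, const struct in6_addr *a)
	{
		if (IN6_IS_ADDR_LOOPBACK(a))
			return (0);	/* loopback is a waste of server tids */
		if (!IN6_ARE_ADDR_EQUAL(a, &in6addr_any) &&
		    !addr_in_clip_table(sc, a))
			return (0);	/* avoids the do_pass_open_rpl() panic */
		return (1);
	}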
r260210:
Add an option to enable or disable the copying of small RX frames,
which is done to improve small-frame performance.
When RX buffer packing is in use, the copy isn't necessarily required.
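The knob shows up in this MFC as an adapter-wide sysctl, do_rx_copy
(enabled by default).  A rough sketch of the gate, based on the condition
visible in the t4_sge.c rx path below (the mbuf handling here is an
illustrative reconstruction, not an excerpt):
	/* Copy small frames into a fresh mbuf only while the knob is on. */
	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
		struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

		if (m == NULL)
			return (NULL);		/* caller will retry */
		bcopy(payload, mtod(m, caddr_t), len);
		m->m_len = len;
	}
At runtime the copy can be toggled with something like
sysctl dev.t4nex.0.do_rx_copy=0 (t4nex or t5nex, depending on the adapter).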
r261533:
cxgbe(4): Use the port's tx channel to identify it to t4_clr_port_stats.
r261536:
cxgbe(4): The T5 allows for a different freelist starvation threshold
for queues with buffer packing. Use the correct value to calculate a
freelist's low water mark.
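Condensed from the t4_read_chip_settings() hunk in t4_sge.c below: T4 keeps a
single threshold, while T5 reports a separate one for packing-capable
freelists, and that second value now feeds the low water mark of freelists
that pack buffers.
	r = t4_read_reg(sc, A_SGE_CONM_CTRL);
	s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1;
	if (is_t4(sc))
		s->fl_starve_threshold2 = s->fl_starve_threshold;
	else
		s->fl_starve_threshold2 = G_EGRTHRESHOLDPACKING(r) * 2 + 1;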
r261537:
cxgbe(4): Use the rx channel map (instead of the tx channel map) as the
congestion channel map.
r261558:
Add a new sysctl, dev.cxgbe.N.rsrv_noflow, and a companion tunable,
hw.cxgbe.rsrv_noflow. When set, queue 0 of the port is reserved for
TX packets without a flowid. The hash value of packets with a flowid
is bumped up by 1. The intent is to provide a private queue for
link-level packets like LACP that is unlikely to overflow or suffer
deep queue latency.
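With the knob set, the queue selection in cxgbe_transmit() becomes (see the
t4_main.c hunk below):
	if (m->m_flags & M_FLOWID)
		txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq))
		    + pi->rsrv_noflowq);
Packets without a flowid stay on queue 0 and hashed traffic spreads over the
remaining queues.  The behaviour can be requested at boot with
hw.cxgbe.rsrv_noflowq=1 in loader.conf, or per port at runtime through
dev.cxgbe.N.rsrv_noflowq.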
r263317:
cxgbe(4): significant rx rework.
- More flexible cluster size selection, including the ability to fall
back to a safe cluster size (PAGE_SIZE from zone_jumbop by default) in
case an allocation of a larger size fails (see the sketch after this list).
- A single get_fl_payload() function that assembles the payload into an
mbuf chain for any kind of freelist. This replaces two variants: one
for freelists with buffer packing enabled and another for those without.
- Buffer packing with any sized cluster. It was limited to 4K clusters
only before this change.
- Enable buffer packing for TOE rx queues as well.
- Statistics and tunables to go with all these changes. The driver's
man page will be updated separately.
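A minimal sketch of the fallback described in the first bullet, assuming two
illustrative zone variables (the real selection is done by
find_best_refill_source() and find_safe_refill_source() in t4_sge.c, using
the freelist's default and alternate cluster layouts):
	/*
	 * Sketch only: preferred_zone/safe_zone stand in for the UMA zones
	 * behind fl->cll_def and fl->cll_alt.  The safe zone is zone_jumbop
	 * (PAGE_SIZE clusters) unless hw.cxgbe.safest_rx_cluster says
	 * otherwise.
	 */
	cl = uma_zalloc(preferred_zone, M_NOWAIT);
	if (cl == NULL) {
		cl = uma_zalloc(safe_zone, M_NOWAIT);
		if (cl == NULL)
			break;	/* leave the fl short; refill again later */
	}
The tunables that go with this rework are visible in the diff:
hw.cxgbe.largest_rx_cluster, hw.cxgbe.safest_rx_cluster, and
hw.cxgbe.allow_mbufs_in_cluster.  The new per-freelist counters
(mbuf_allocated, mbuf_inlined, cl_allocated, cl_recycled, cl_fast_recycled)
provide the statistics mentioned above.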
r263412:
cxgbe(4): if_iqdrops statistic should include tunnel congestion drops.
r263457:
cxgbe(4): Recognize the "spider" configuration where a T5 card's 40G
QSFP port is presented as 4 distinct 10G SFP+ ports to the driver.
r264621:
Use the correct (integer) type for the temperature sysctl.
Modified:
stable/9/sys/dev/cxgbe/adapter.h
stable/9/sys/dev/cxgbe/common/t4_hw.c
stable/9/sys/dev/cxgbe/common/t4_hw.h
stable/9/sys/dev/cxgbe/t4_main.c
stable/9/sys/dev/cxgbe/t4_sge.c
stable/9/sys/dev/cxgbe/tom/t4_listen.c
stable/9/sys/dev/cxgbe/tom/t4_tom.h
Directory Properties:
stable/9/sys/ (props changed)
stable/9/sys/dev/ (props changed)
Modified: stable/9/sys/dev/cxgbe/adapter.h
==============================================================================
--- stable/9/sys/dev/cxgbe/adapter.h Wed May 7 18:15:02 2014 (r265581)
+++ stable/9/sys/dev/cxgbe/adapter.h Wed May 7 18:15:20 2014 (r265582)
@@ -128,10 +128,11 @@ enum {
RX_FL_ESIZE = EQ_ESIZE, /* 8 64bit addresses */
#if MJUMPAGESIZE != MCLBYTES
- FL_BUF_SIZES_MAX = 5, /* cluster, jumbop, jumbo9k, jumbo16k, extra */
+ SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */
#else
- FL_BUF_SIZES_MAX = 4, /* cluster, jumbo9k, jumbo16k, extra */
+ SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */
#endif
+ CL_METADATA_SIZE = CACHE_LINE_SIZE,
CTRL_EQ_QSIZE = 128,
@@ -203,10 +204,12 @@ struct port_info {
uint8_t mod_type;
uint8_t port_id;
uint8_t tx_chan;
+ uint8_t rx_chan_map; /* rx MPS channel bitmap */
/* These need to be int as they are used in sysctl */
int ntxq; /* # of tx queues */
int first_txq; /* index of first tx queue */
+ int rsrv_noflowq; /* Reserve queue 0 for non-flowid packets */
int nrxq; /* # of rx queues */
int first_rxq; /* index of first rx queue */
#ifdef TCP_OFFLOAD
@@ -232,15 +235,28 @@ struct port_info {
uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */
};
-struct fl_sdesc {
- bus_dmamap_t map;
- caddr_t cl;
- uint8_t tag_idx; /* the fl->tag entry this map comes from */
+/* Where the cluster came from, how it has been carved up. */
+struct cluster_layout {
+ int8_t zidx;
+ int8_t hwidx;
+ uint16_t region1; /* mbufs laid out within this region */
+ /* region2 is the DMA region */
+ uint16_t region3; /* cluster_metadata within this region */
+};
+
+struct cluster_metadata {
+ u_int refcount;
#ifdef INVARIANTS
- __be64 ba_hwtag;
+ struct fl_sdesc *sd; /* For debug only. Could easily be stale */
#endif
};
+struct fl_sdesc {
+ caddr_t cl;
+ uint8_t nmbuf;
+ struct cluster_layout cll;
+};
+
struct tx_desc {
__be64 flit[8];
};
@@ -359,17 +375,19 @@ struct sge_eq {
uint32_t unstalled; /* recovered from stall */
};
-struct fl_buf_info {
- u_int size;
- int type;
- int hwtag:4; /* tag in low 4 bits of the pa. */
- uma_zone_t zone;
-};
-#define FL_BUF_SIZES(sc) (sc->sge.fl_buf_sizes)
-#define FL_BUF_SIZE(sc, x) (sc->sge.fl_buf_info[x].size)
-#define FL_BUF_TYPE(sc, x) (sc->sge.fl_buf_info[x].type)
-#define FL_BUF_HWTAG(sc, x) (sc->sge.fl_buf_info[x].hwtag)
-#define FL_BUF_ZONE(sc, x) (sc->sge.fl_buf_info[x].zone)
+struct sw_zone_info {
+ uma_zone_t zone; /* zone that this cluster comes from */
+ int size; /* size of cluster: 2K, 4K, 9K, 16K, etc. */
+ int type; /* EXT_xxx type of the cluster */
+ int8_t head_hwidx;
+ int8_t tail_hwidx;
+};
+
+struct hw_buf_info {
+ int8_t zidx; /* backpointer to zone; -ve means unused */
+ int8_t next; /* next hwidx for this zone; -1 means no more */
+ int size;
+};
enum {
FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */
@@ -383,9 +401,8 @@ enum {
struct sge_fl {
bus_dma_tag_t desc_tag;
bus_dmamap_t desc_map;
- bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are
- valid */
- uint8_t tag_idx;
+ struct cluster_layout cll_def; /* default refill zone, layout */
+ struct cluster_layout cll_alt; /* alternate refill zone, layout */
struct mtx fl_lock;
char lockname[16];
int flags;
@@ -402,9 +419,17 @@ struct sge_fl {
uint32_t needed; /* # of buffers needed to fill up fl. */
uint32_t lowat; /* # of buffers <= this means fl needs help */
uint32_t pending; /* # of bufs allocated since last doorbell */
- u_int dmamap_failed;
- struct mbuf *mstash[8];
TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
+
+ struct mbuf *m0;
+ struct mbuf **pnext;
+ u_int remaining;
+
+ uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */
+ uint64_t mbuf_inlined; /* # of mbuf created within clusters */
+ uint64_t cl_allocated; /* # of clusters allocated */
+ uint64_t cl_recycled; /* # of clusters recycled */
+ uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */
};
/* txq: SGE egress queue + what's needed for Ethernet NIC */
@@ -510,6 +535,7 @@ struct sge {
int timer_val[SGE_NTIMERS];
int counter_val[SGE_NCOUNTERS];
int fl_starve_threshold;
+ int fl_starve_threshold2;
int eq_s_qpp;
int iq_s_qpp;
@@ -537,8 +563,11 @@ struct sge {
struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */
struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */
- u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE);
- struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX];
+ int pack_boundary;
+ int8_t safe_hwidx1; /* may not have room for metadata */
+ int8_t safe_hwidx2; /* with room for metadata and maybe more */
+ struct sw_zone_info sw_zone_info[SW_ZONE_SIZES];
+ struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES];
};
struct rss_header;
@@ -629,6 +658,8 @@ struct adapter {
const char *last_op;
const void *last_op_thr;
#endif
+
+ int sc_do_rxcopy;
};
#define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock)
Modified: stable/9/sys/dev/cxgbe/common/t4_hw.c
==============================================================================
--- stable/9/sys/dev/cxgbe/common/t4_hw.c Wed May 7 18:15:02 2014 (r265581)
+++ stable/9/sys/dev/cxgbe/common/t4_hw.c Wed May 7 18:15:20 2014 (r265582)
@@ -5645,6 +5645,7 @@ int __devinit t4_port_init(struct port_i
p->viid = ret;
p->tx_chan = j;
+ p->rx_chan_map = get_mps_bg_map(adap, j);
p->lport = j;
p->rss_size = rss_size;
t4_os_set_hw_addr(adap, p->port_id, addr);
Modified: stable/9/sys/dev/cxgbe/common/t4_hw.h
==============================================================================
--- stable/9/sys/dev/cxgbe/common/t4_hw.h Wed May 7 18:15:02 2014 (r265581)
+++ stable/9/sys/dev/cxgbe/common/t4_hw.h Wed May 7 18:15:20 2014 (r265582)
@@ -86,6 +86,7 @@ enum {
SGE_NTIMERS = 6, /* # of interrupt holdoff timer values */
SGE_NCOUNTERS = 4, /* # of interrupt packet counter values */
SGE_MAX_IQ_SIZE = 65520,
+ SGE_FLBUF_SIZES = 16,
};
struct sge_qstat { /* data written to SGE queue status entries */
Modified: stable/9/sys/dev/cxgbe/t4_main.c
==============================================================================
--- stable/9/sys/dev/cxgbe/t4_main.c Wed May 7 18:15:02 2014 (r265581)
+++ stable/9/sys/dev/cxgbe/t4_main.c Wed May 7 18:15:20 2014 (r265582)
@@ -197,6 +197,9 @@ TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1
static int t4_nrxq1g = -1;
TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g);
+static int t4_rsrv_noflowq = 0;
+TUNABLE_INT("hw.cxgbe.rsrv_noflowq", &t4_rsrv_noflowq);
+
#ifdef TCP_OFFLOAD
#define NOFLDTXQ_10G 8
static int t4_nofldtxq10g = -1;
@@ -299,6 +302,7 @@ struct intrs_and_queues {
int nrxq10g; /* # of NIC rxq's for each 10G port */
int ntxq1g; /* # of NIC txq's for each 1G port */
int nrxq1g; /* # of NIC rxq's for each 1G port */
+ int rsrv_noflowq; /* Flag whether to reserve queue 0 */
#ifdef TCP_OFFLOAD
int nofldtxq10g; /* # of TOE txq's for each 10G port */
int nofldrxq10g; /* # of TOE rxq's for each 10G port */
@@ -375,6 +379,7 @@ static int cxgbe_sysctls(struct port_inf
static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
static int sysctl_bitfield(SYSCTL_HANDLER_ARGS);
static int sysctl_btphy(SYSCTL_HANDLER_ARGS);
+static int sysctl_noflowq(SYSCTL_HANDLER_ARGS);
static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS);
static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS);
static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS);
@@ -489,6 +494,8 @@ CTASSERT(offsetof(struct sge_ofld_rxq, f
CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS);
CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES);
+CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
+
static int
t4_probe(device_t dev)
{
@@ -774,6 +781,11 @@ t4_attach(device_t dev)
pi->ntxq = iaq.ntxq1g;
}
+ if (pi->ntxq > 1)
+ pi->rsrv_noflowq = iaq.rsrv_noflowq ? 1 : 0;
+ else
+ pi->rsrv_noflowq = 0;
+
rqidx += pi->nrxq;
tqidx += pi->ntxq;
@@ -1252,7 +1264,8 @@ cxgbe_transmit(struct ifnet *ifp, struct
}
if (m->m_flags & M_FLOWID)
- txq += (m->m_pkthdr.flowid % pi->ntxq);
+ txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq))
+ + pi->rsrv_noflowq);
br = txq->br;
if (TXQ_TRYLOCK(txq) == 0) {
@@ -1704,6 +1717,7 @@ cfg_itype_and_nqueues(struct adapter *sc
iaq->ntxq1g = t4_ntxq1g;
iaq->nrxq10g = nrxq10g = t4_nrxq10g;
iaq->nrxq1g = nrxq1g = t4_nrxq1g;
+ iaq->rsrv_noflowq = t4_rsrv_noflowq;
#ifdef TCP_OFFLOAD
if (is_offload(sc)) {
iaq->nofldtxq10g = t4_nofldtxq10g;
@@ -2624,6 +2638,7 @@ build_medialist(struct port_info *pi)
ifmedia_set(media, m | IFM_10G_CX4);
break;
+ case FW_PORT_TYPE_QSFP_10G:
case FW_PORT_TYPE_SFP:
case FW_PORT_TYPE_FIBER_XFI:
case FW_PORT_TYPE_FIBER_XAUI:
@@ -4029,6 +4044,7 @@ static void
cxgbe_tick(void *arg)
{
struct port_info *pi = arg;
+ struct adapter *sc = pi->adapter;
struct ifnet *ifp = pi->ifp;
struct sge_txq *txq;
int i, drops;
@@ -4040,7 +4056,7 @@ cxgbe_tick(void *arg)
return; /* without scheduling another callout */
}
- t4_get_port_stats(pi->adapter, pi->tx_chan, s);
+ t4_get_port_stats(sc, pi->tx_chan, s);
ifp->if_opackets = s->tx_frames - s->tx_pause;
ifp->if_ipackets = s->rx_frames - s->rx_pause;
@@ -4051,6 +4067,19 @@ cxgbe_tick(void *arg)
ifp->if_iqdrops = s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 +
s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 +
s->rx_trunc3;
+ for (i = 0; i < 4; i++) {
+ if (pi->rx_chan_map & (1 << i)) {
+ uint32_t v;
+
+ /*
+ * XXX: indirect reads from the same ADDR/DATA pair can
+ * race with each other.
+ */
+ t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v,
+ 1, A_TP_MIB_TNL_CNG_DROP_0 + i);
+ ifp->if_iqdrops += v;
+ }
+ }
drops = s->tx_drop;
for_each_txq(pi, i, txq)
@@ -4197,6 +4226,10 @@ t4_sysctls(struct adapter *sc)
oid = device_get_sysctl_tree(sc->dev);
c0 = children = SYSCTL_CHILDREN(oid);
+ sc->sc_do_rxcopy = 1;
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW,
+ &sc->sc_do_rxcopy, 1, "Do RX copy of small frames");
+
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL,
sc->params.nports, "# of ports");
@@ -4257,7 +4290,7 @@ t4_sysctls(struct adapter *sc)
NULL, sc->tids.nftids, "number of filters");
SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT |
- CTLFLAG_RD, sc, 0, sysctl_temperature, "A",
+ CTLFLAG_RD, sc, 0, sysctl_temperature, "I",
"chip temperature (in Celsius)");
t4_sge_sysctls(sc, ctx, children);
@@ -4498,6 +4531,9 @@ cxgbe_sysctls(struct port_info *pi)
&pi->first_rxq, 0, "index of first rx queue");
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD,
&pi->first_txq, 0, "index of first tx queue");
+ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT |
+ CTLFLAG_RW, pi, 0, sysctl_noflowq, "IU",
+ "Reserve queue 0 for non-flowid packets");
#ifdef TCP_OFFLOAD
if (is_offload(sc)) {
@@ -4752,6 +4788,25 @@ sysctl_btphy(SYSCTL_HANDLER_ARGS)
}
static int
+sysctl_noflowq(SYSCTL_HANDLER_ARGS)
+{
+ struct port_info *pi = arg1;
+ int rc, val;
+
+ val = pi->rsrv_noflowq;
+ rc = sysctl_handle_int(oidp, &val, 0, req);
+ if (rc != 0 || req->newptr == NULL)
+ return (rc);
+
+ if ((val >= 1) && (pi->ntxq > 1))
+ pi->rsrv_noflowq = 1;
+ else
+ pi->rsrv_noflowq = 0;
+
+ return (rc);
+}
+
+static int
sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS)
{
struct port_info *pi = arg1;
@@ -7728,11 +7783,11 @@ t4_ioctl(struct cdev *dev, unsigned long
if (port_id >= sc->params.nports)
return (EINVAL);
+ pi = sc->port[port_id];
/* MAC stats */
- t4_clr_port_stats(sc, port_id);
+ t4_clr_port_stats(sc, pi->tx_chan);
- pi = sc->port[port_id];
if (pi->flags & PORT_INIT_DONE) {
struct sge_rxq *rxq;
struct sge_txq *txq;
Modified: stable/9/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- stable/9/sys/dev/cxgbe/t4_sge.c Wed May 7 18:15:02 2014 (r265581)
+++ stable/9/sys/dev/cxgbe/t4_sge.c Wed May 7 18:15:20 2014 (r265582)
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kdb.h>
#include <sys/malloc.h>
#include <sys/queue.h>
+#include <sys/sbuf.h>
#include <sys/taskqueue.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
@@ -50,6 +51,8 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <machine/md_var.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
#include "common/common.h"
#include "common/t4_regs.h"
@@ -122,6 +125,27 @@ static int t4_fl_pack;
static int t5_fl_pack;
TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
+/*
+ * Allow the driver to create mbuf(s) in a cluster allocated for rx.
+ * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
+ * 1: ok to create mbuf(s) within a cluster if there is room.
+ */
+static int allow_mbufs_in_cluster = 1;
+TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
+
+/*
+ * Largest rx cluster size that the driver is allowed to allocate.
+ */
+static int largest_rx_cluster = MJUM16BYTES;
+TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
+
+/*
+ * Size of cluster allocation that's most likely to succeed. The driver will
+ * fall back to this size if it fails to allocate clusters larger than this.
+ */
+static int safest_rx_cluster = PAGE_SIZE;
+TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
+
/* Used to track coalesced tx work request */
struct txpkts {
uint64_t *flitp; /* ptr to flit where next pkt should start */
@@ -138,9 +162,7 @@ struct sgl {
};
static int service_iq(struct sge_iq *, int);
-static struct mbuf *get_fl_payload1(struct adapter *, struct sge_fl *, uint32_t,
- int *);
-static struct mbuf *get_fl_payload2(struct adapter *, struct sge_fl *, uint32_t,
+static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t,
int *);
static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
@@ -156,6 +178,8 @@ static int free_ring(struct adapter *, b
static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *,
int, int);
static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *);
+static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
+ struct sge_fl *);
static int alloc_fwq(struct adapter *);
static int free_fwq(struct adapter *);
static int alloc_mgmtq(struct adapter *);
@@ -189,7 +213,8 @@ static int refill_fl(struct adapter *, s
static void refill_sfl(void *);
static int alloc_fl_sdesc(struct sge_fl *);
static void free_fl_sdesc(struct adapter *, struct sge_fl *);
-static void set_fl_tag_idx(struct adapter *, struct sge_fl *, int);
+static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
+static void find_safe_refill_source(struct adapter *, struct sge_fl *);
static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
@@ -214,6 +239,7 @@ static int handle_fw_msg(struct sge_iq *
struct mbuf *);
static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
+static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
/*
* Called on MOD_LOAD. Validates and calculates the SGE tunables.
@@ -262,7 +288,7 @@ t4_sge_modload(void)
/* T5's pack boundary is independent of the pad boundary. */
if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
!powerof2(fl_pack))
- t5_fl_pack = max(pad, 64);
+ t5_fl_pack = max(pad, CACHE_LINE_SIZE);
else
t5_fl_pack = fl_pack;
@@ -312,14 +338,18 @@ t4_tweak_chip_settings(struct adapter *s
int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
- int sw_flbuf_sizes[] = {
+ static int sge_flbuf_sizes[] = {
MCLBYTES,
#if MJUMPAGESIZE != MCLBYTES
MJUMPAGESIZE,
+ MJUMPAGESIZE - CL_METADATA_SIZE,
+ MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
#endif
MJUM9BYTES,
MJUM16BYTES,
- MJUMPAGESIZE - MSIZE
+ MCLBYTES - MSIZE - CL_METADATA_SIZE,
+ MJUM9BYTES - CL_METADATA_SIZE,
+ MJUM16BYTES - CL_METADATA_SIZE,
};
KASSERT(sc->flags & MASTER_PF,
@@ -357,9 +387,11 @@ t4_tweak_chip_settings(struct adapter *s
V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
- for (i = 0; i < min(nitems(sw_flbuf_sizes), 16); i++) {
+ KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
+ ("%s: hw buffer size table too big", __func__));
+ for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
- sw_flbuf_sizes[i]);
+ sge_flbuf_sizes[i]);
}
v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
@@ -414,6 +446,18 @@ t4_tweak_chip_settings(struct adapter *s
}
/*
+ * SGE wants the buffer to be at least 64B and then a multiple of the pad
+ * boundary or 16, whichever is greater.
+ */
+static inline int
+hwsz_ok(int hwsz)
+{
+ int mask = max(fl_pad, 16) - 1;
+
+ return (hwsz >= 64 && (hwsz & mask) == 0);
+}
+
+/*
* XXX: driver really should be able to deal with unexpected settings.
*/
int
@@ -423,7 +467,7 @@ t4_read_chip_settings(struct adapter *sc
int i, j, n, rc = 0;
uint32_t m, v, r;
uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
- uint32_t sge_flbuf_sizes[16], sw_flbuf_sizes[] = {
+ static int sw_buf_sizes[] = { /* Sorted by size */
MCLBYTES,
#if MJUMPAGESIZE != MCLBYTES
MJUMPAGESIZE,
@@ -431,6 +475,8 @@ t4_read_chip_settings(struct adapter *sc
MJUM9BYTES,
MJUM16BYTES
};
+ struct sw_zone_info *swz, *safe_swz;
+ struct hw_buf_info *hwb;
m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
@@ -461,6 +507,7 @@ t4_read_chip_settings(struct adapter *sc
rc = EINVAL;
}
}
+ s->pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
@@ -476,45 +523,93 @@ t4_read_chip_settings(struct adapter *sc
rc = EINVAL;
}
- /*
- * Make a list of SGE FL buffer sizes programmed in the chip and tally
- * it with the FL buffer sizes that we'd like to use.
- */
- n = 0;
- for (i = 0; i < nitems(sge_flbuf_sizes); i++) {
+ /* Filter out unusable hw buffer sizes entirely (mark with -2). */
+ hwb = &s->hw_buf_info[0];
+ for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
- sge_flbuf_sizes[i] = r;
- if (r == MJUMPAGESIZE - MSIZE &&
- (sc->flags & BUF_PACKING_OK) == 0) {
- sc->flags |= BUF_PACKING_OK;
- FL_BUF_HWTAG(sc, n) = i;
- FL_BUF_SIZE(sc, n) = MJUMPAGESIZE - MSIZE;
- FL_BUF_TYPE(sc, n) = m_gettype(MJUMPAGESIZE);
- FL_BUF_ZONE(sc, n) = m_getzone(MJUMPAGESIZE);
- n++;
- }
+ hwb->size = r;
+ hwb->zidx = hwsz_ok(r) ? -1 : -2;
+ hwb->next = -1;
}
- for (i = 0; i < nitems(sw_flbuf_sizes); i++) {
- for (j = 0; j < nitems(sge_flbuf_sizes); j++) {
- if (sw_flbuf_sizes[i] != sge_flbuf_sizes[j])
+
+ /*
+ * Create a sorted list in decreasing order of hw buffer sizes (and so
+ * increasing order of spare area) for each software zone.
+ */
+ n = 0; /* no usable buffer size to begin with */
+ swz = &s->sw_zone_info[0];
+ safe_swz = NULL;
+ for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
+ int8_t head = -1, tail = -1;
+
+ swz->size = sw_buf_sizes[i];
+ swz->zone = m_getzone(swz->size);
+ swz->type = m_gettype(swz->size);
+
+ if (swz->size == safest_rx_cluster)
+ safe_swz = swz;
+
+ hwb = &s->hw_buf_info[0];
+ for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
+ if (hwb->zidx != -1 || hwb->size > swz->size)
continue;
- FL_BUF_HWTAG(sc, n) = j;
- FL_BUF_SIZE(sc, n) = sw_flbuf_sizes[i];
- FL_BUF_TYPE(sc, n) = m_gettype(sw_flbuf_sizes[i]);
- FL_BUF_ZONE(sc, n) = m_getzone(sw_flbuf_sizes[i]);
+ hwb->zidx = i;
+ if (head == -1)
+ head = tail = j;
+ else if (hwb->size < s->hw_buf_info[tail].size) {
+ s->hw_buf_info[tail].next = j;
+ tail = j;
+ } else {
+ int8_t *cur;
+ struct hw_buf_info *t;
+
+ for (cur = &head; *cur != -1; cur = &t->next) {
+ t = &s->hw_buf_info[*cur];
+ if (hwb->size == t->size) {
+ hwb->zidx = -2;
+ break;
+ }
+ if (hwb->size > t->size) {
+ hwb->next = *cur;
+ *cur = j;
+ break;
+ }
+ }
+ }
+ }
+ swz->head_hwidx = head;
+ swz->tail_hwidx = tail;
+
+ if (tail != -1) {
n++;
- break;
+ if (swz->size - s->hw_buf_info[tail].size >=
+ CL_METADATA_SIZE)
+ sc->flags |= BUF_PACKING_OK;
}
}
if (n == 0) {
device_printf(sc->dev, "no usable SGE FL buffer size.\n");
rc = EINVAL;
- } else if (n == 1 && (sc->flags & BUF_PACKING_OK)) {
- device_printf(sc->dev,
- "no usable SGE FL buffer size when not packing buffers.\n");
- rc = EINVAL;
}
- FL_BUF_SIZES(sc) = n;
+
+ s->safe_hwidx1 = -1;
+ s->safe_hwidx2 = -1;
+ if (safe_swz != NULL) {
+ s->safe_hwidx1 = safe_swz->head_hwidx;
+ for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
+ int spare;
+
+ hwb = &s->hw_buf_info[i];
+ spare = safe_swz->size - hwb->size;
+ if (spare < CL_METADATA_SIZE)
+ continue;
+ if (s->safe_hwidx2 == -1 ||
+ spare == CL_METADATA_SIZE + MSIZE)
+ s->safe_hwidx2 = i;
+ if (spare >= CL_METADATA_SIZE + MSIZE)
+ break;
+ }
+ }
r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
s->counter_val[0] = G_THRESHOLD_0(r);
@@ -568,6 +663,10 @@ t4_read_chip_settings(struct adapter *sc
r = t4_read_reg(sc, A_SGE_CONM_CTRL);
s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1;
+ if (is_t4(sc))
+ s->fl_starve_threshold2 = s->fl_starve_threshold;
+ else
+ s->fl_starve_threshold2 = G_EGRTHRESHOLDPACKING(r) * 2 + 1;
/* egress queues: log2 of # of doorbells per BAR2 page */
r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF);
@@ -622,6 +721,10 @@ t4_sge_sysctls(struct adapter *sc, struc
struct sysctl_oid_list *children)
{
+ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
+ CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
+ "freelist buffer sizes");
+
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)");
@@ -639,8 +742,7 @@ t4_sge_sysctls(struct adapter *sc, struc
"pack multiple frames in one fl buffer");
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
- NULL, is_t5(sc) ? t5_fl_pack : t4_fl_pack,
- "payload pack boundary (bytes)");
+ NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)");
}
int
@@ -760,7 +862,7 @@ port_intr_iq(struct port_info *pi, int i
#ifdef TCP_OFFLOAD
if (sc->flags & INTR_DIRECT) {
idx %= pi->nrxq + pi->nofldrxq;
-
+
if (idx >= pi->nrxq) {
idx -= pi->nrxq;
iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq;
@@ -791,29 +893,28 @@ port_intr_iq(struct port_info *pi, int i
return (iq);
}
+/* Maximum payload that can be delivered with a single iq descriptor */
static inline int
-mtu_to_bufsize(int mtu)
+mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
{
- int bufsize;
-
- /* large enough for a frame even when VLAN extraction is disabled */
- bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu;
- bufsize = roundup2(bufsize + fl_pktshift, fl_pad);
-
- return (bufsize);
-}
+ int payload;
#ifdef TCP_OFFLOAD
-static inline int
-mtu_to_bufsize_toe(struct adapter *sc, int mtu)
-{
-
- if (sc->tt.rx_coalesce)
- return (G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)));
+ if (toe) {
+ payload = sc->tt.rx_coalesce ?
+ G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
+ } else {
+#endif
+ /* large enough even when hw VLAN extraction is disabled */
+ payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN +
+ mtu;
+#ifdef TCP_OFFLOAD
+ }
+#endif
+ payload = roundup2(payload, fl_pad);
- return (mtu);
+ return (payload);
}
-#endif
int
t4_setup_port_queues(struct port_info *pi)
@@ -832,7 +933,7 @@ t4_setup_port_queues(struct port_info *p
struct ifnet *ifp = pi->ifp;
struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
- int bufsize, pack;
+ int maxp, pack, mtu = ifp->if_mtu;
oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
NULL, "rx queues");
@@ -853,7 +954,7 @@ t4_setup_port_queues(struct port_info *p
* a) initialize iq and fl
* b) allocate queue iff it will take direct interrupts.
*/
- bufsize = mtu_to_bufsize(ifp->if_mtu);
+ maxp = mtu_to_max_payload(sc, mtu, 0);
pack = enable_buffer_packing(sc);
for_each_rxq(pi, i, rxq) {
@@ -862,7 +963,7 @@ t4_setup_port_queues(struct port_info *p
snprintf(name, sizeof(name), "%s rxq%d-fl",
device_get_nameunit(pi->dev), i);
- init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, bufsize, pack, name);
+ init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name);
if (sc->flags & INTR_DIRECT
#ifdef TCP_OFFLOAD
@@ -878,8 +979,7 @@ t4_setup_port_queues(struct port_info *p
}
#ifdef TCP_OFFLOAD
- bufsize = mtu_to_bufsize_toe(sc, ifp->if_mtu);
- pack = 0; /* XXX: think about this some more */
+ maxp = mtu_to_max_payload(sc, mtu, 1);
for_each_ofld_rxq(pi, i, ofld_rxq) {
init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
@@ -887,8 +987,7 @@ t4_setup_port_queues(struct port_info *p
snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
device_get_nameunit(pi->dev), i);
- init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, pack,
- name);
+ init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name);
if (sc->flags & INTR_DIRECT ||
(sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) {
@@ -1162,10 +1261,7 @@ service_iq(struct sge_iq *iq, int budget
("%s: data for an iq (%p) with no freelist",
__func__, iq));
- m0 = fl->flags & FL_BUF_PACKING ?
- get_fl_payload1(sc, fl, lq, &fl_bufs_used) :
- get_fl_payload2(sc, fl, lq, &fl_bufs_used);
-
+ m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used);
if (__predict_false(m0 == NULL))
goto process_iql;
#ifdef T4_PKT_TIMESTAMP
@@ -1222,6 +1318,14 @@ service_iq(struct sge_iq *iq, int budget
break;
}
+ if (fl_bufs_used >= 16) {
+ FL_LOCK(fl);
+ fl->needed += fl_bufs_used;
+ refill_fl(sc, fl, 32);
+ FL_UNLOCK(fl);
+ fl_bufs_used = 0;
+ }
+
iq_next(iq);
if (++ndescs == limit) {
t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
@@ -1230,14 +1334,6 @@ service_iq(struct sge_iq *iq, int budget
V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
ndescs = 0;
- if (fl_bufs_used > 0) {
- FL_LOCK(fl);
- fl->needed += fl_bufs_used;
- refill_fl(sc, fl, fl->cap / 8);
- FL_UNLOCK(fl);
- fl_bufs_used = 0;
- }
-
if (budget)
return (EINPROGRESS);
}
@@ -1280,7 +1376,7 @@ process_iql:
FL_LOCK(fl);
fl->needed += fl_bufs_used;
- starved = refill_fl(sc, fl, fl->cap / 4);
+ starved = refill_fl(sc, fl, 64);
FL_UNLOCK(fl);
if (__predict_false(starved != 0))
add_fl_to_sfl(sc, fl);
@@ -1289,74 +1385,28 @@ process_iql:
return (0);
}
-static int
-fill_mbuf_stash(struct sge_fl *fl)
-{
- int i;
-
- for (i = 0; i < nitems(fl->mstash); i++) {
- if (fl->mstash[i] == NULL) {
- struct mbuf *m;
- if ((m = m_get(M_NOWAIT, MT_NOINIT)) == NULL)
- return (ENOBUFS);
- fl->mstash[i] = m;
- }
- }
- return (0);
-}
-
-static struct mbuf *
-get_mbuf_from_stash(struct sge_fl *fl)
+static inline int
+cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
{
- int i;
+ int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
- for (i = 0; i < nitems(fl->mstash); i++) {
- if (fl->mstash[i] != NULL) {
- struct mbuf *m;
-
- m = fl->mstash[i];
- fl->mstash[i] = NULL;
- return (m);
- } else
- fl->mstash[i] = m_get(M_NOWAIT, MT_NOINIT);
- }
+ if (rc)
+ MPASS(cll->region3 >= CL_METADATA_SIZE);
- return (m_get(M_NOWAIT, MT_NOINIT));
+ return (rc);
}
-static void
-return_mbuf_to_stash(struct sge_fl *fl, struct mbuf *m)
+static inline struct cluster_metadata *
+cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
+ caddr_t cl)
{
- int i;
- if (m == NULL)
- return;
+ if (cl_has_metadata(fl, cll)) {
+ struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
- for (i = 0; i < nitems(fl->mstash); i++) {
- if (fl->mstash[i] == NULL) {
- fl->mstash[i] = m;
- return;
- }
+ return ((struct cluster_metadata *)(cl + swz->size) - 1);
}
- m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
- m_free(m);
-}
-
-/* buf can be any address within the buffer */
-static inline u_int *
-find_buf_refcnt(caddr_t buf)
-{
- uintptr_t ptr = (uintptr_t)buf;
-
- return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE - sizeof(u_int)));
-}
-
-static inline struct mbuf *
-find_buf_mbuf(caddr_t buf)
-{
- uintptr_t ptr = (uintptr_t)buf;
-
- return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1)));
+ return (NULL);
}
static void
@@ -1364,177 +1414,115 @@ rxb_free(void *arg1, void *arg2)
{
uma_zone_t zone = arg1;
caddr_t cl = arg2;
-#ifdef notyet
- u_int refcount;
- refcount = *find_buf_refcnt(cl);
- KASSERT(refcount == 0, ("%s: cl %p refcount is %u", __func__,
- cl - MSIZE, refcount));
-#endif
- cl -= MSIZE;
uma_zfree(zone, cl);
}
+/*
+ * The mbuf returned by this function could be allocated from zone_mbuf or
+ * constructed in spare room in the cluster.
+ *
+ * The mbuf carries the payload in one of these ways
+ * a) frame inside the mbuf (mbuf from zone_mbuf)
+ * b) m_cljset (for clusters without metadata) zone_mbuf
+ * c) m_extaddref (cluster with metadata) inline mbuf
+ * d) m_extaddref (cluster with metadata) zone_mbuf
+ */
static struct mbuf *
-get_fl_payload1(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
- int *fl_bufs_used)
+get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags)
{
- struct mbuf *m0, *m;
+ struct mbuf *m;
struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
- unsigned int nbuf, len;
- int pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
+ struct cluster_layout *cll = &sd->cll;
+ struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
+ struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
+ struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
+ int len, padded_len;
+ caddr_t payload;
+
+ len = min(total, hwb->size - fl->rx_offset);
+ padded_len = roundup2(len, fl_pad);
+ payload = sd->cl + cll->region1 + fl->rx_offset;
- /*
- * No assertion for the fl lock because we don't need it. This routine
- * is called only from the rx interrupt handler and it only updates
- * fl->cidx. (Contrast that with fl->pidx/fl->needed which could be
- * updated in the rx interrupt handler or the starvation helper routine.
- * That's why code that manipulates fl->pidx/fl->needed needs the fl
- * lock but this routine does not).
- */
+ if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
- KASSERT(fl->flags & FL_BUF_PACKING,
- ("%s: buffer packing disabled for fl %p", __func__, fl));
-
- len = G_RSPD_LEN(len_newbuf);
+ /*
+ * Copy payload into a freshly allocated mbuf.
+ */
- if ((len_newbuf & F_RSPD_NEWBUF) == 0) {
- KASSERT(fl->rx_offset > 0,
- ("%s: packed frame but driver at offset=0", __func__));
-
- /* A packed frame is guaranteed to fit entirely in this buf. */
- KASSERT(FL_BUF_SIZE(sc, sd->tag_idx) - fl->rx_offset >= len,
- ("%s: packing error. bufsz=%u, offset=%u, len=%u",
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***