PERFORCE change 129436 for review
Kip Macy
kmacy at FreeBSD.org
Fri Nov 23 15:32:23 PST 2007
http://perforce.freebsd.org/chv.cgi?CH=129436
Change 129436 by kmacy at kmacy:storage:toestack on 2007/11/23 23:32:18
add initial DDP infrastructure
Affected files ...
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#23 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#3 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.c#12 edit
Differences ...
==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#23 (text+ko) ====
@@ -80,6 +80,7 @@
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
/*
* For ULP connections HW may add headers, e.g., for digests, that aren't part
@@ -1155,7 +1156,6 @@
m_free(m);
}
INP_UNLOCK(inp);
-
}
/*
@@ -1444,7 +1444,213 @@
return (0);
}
+static void
+new_rx_data_ddp(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_data_ddp *hdr;
+ unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
+
+#ifdef notyet
+ if (unlikely(sk_no_receive(sk))) {
+ handle_excess_rx(so, m);
+ return;
+ }
+#endif
+ q = &toep->tp_ddp_state;
+ hdr = cplhdr(m);
+ ddp_report = ntohl(hdr->u.ddp_report);
+ buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+ bsp = &q->buf_state[buf_idx];
+
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(sk),
+ "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
+ "hdr seq 0x%x len %u offset %u",
+ tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
+ ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
+ T3_TRACE1(TIDTB(sk),
+ "new_rx_data_ddp: ddp_report 0x%x",
+ ddp_report);
+#endif
+
+ ddp_len = ntohs(hdr->len);
+ rcv_nxt = ntohl(hdr->seq) + ddp_len;
+
+ /*
+ * Overload csum_data to store the old rcv_nxt.
+ */
+ m->m_pkthdr.csum_data = tp->rcv_nxt;
+ tp->rcv_nxt = rcv_nxt;
+
+ /*
+ * Store the length in m->m_len. We are changing the meaning of
+ * m->m_len here, so nothing from now on may interpret the m_len of
+ * this mbuf the usual way.
+ */
+ m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;
+
+ /*
+ * Figure out where the new data was placed in the buffer and store it
+ * in 'when'. Assumes the buffer offset starts at 0; the consumer needs
+ * to account for the page pod's pg_offset.
+ */
+ end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
+#ifdef notyet
+ TCP_SKB_CB(skb)->when = end_offset - skb->len;
+
+ /*
+ * We store in mac.raw the address of the gather list where the
+ * placement happened.
+ */
+ skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+ bsp->cur_offset = end_offset;
+
+ /*
+ * Bit 0 of csum_flags stores whether the DDP buffer is completed.
+ * Note that other parts of the code depend on this being in bit 0.
+ */
+ if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->length) {
+#if 0
+ TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */
+#endif
+ panic("spurious ddp completion");
+ } else {
+ m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
+ if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1; /* flip buffers */
+ }
+
+ if (bsp->flags & DDP_BF_NOCOPY) {
+ m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
+ bsp->flags &= ~DDP_BF_NOCOPY;
+ }
+
+ if (ddp_report & F_DDP_PSH)
+ m->m_pkthdr.csum_flags |= DDP_BF_PSH;
+
+ tp->t_rcvtime = ticks;
+ sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+#endif
+}
+
+#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
+ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
+ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
+ F_DDP_INVALID_PPOD)
+
+/*
+ * Handler for RX_DATA_DDP CPL messages.
+ */
+static int
+do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = ctx;
+ struct socket *so = toeptoso(toep);
+ const struct cpl_rx_data_ddp *hdr = cplhdr(m);
+
+ VALIDATE_SOCK(so);
+
+ if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
+ log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
+ GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
+ return (CPL_RET_BUF_DONE);
+ }
+#if 0
+ skb->h.th = tcphdr_skb->h.th;
+#endif
+ new_rx_data_ddp(so, m);
+ return (0);
+}
+
+static void
+process_ddp_complete(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_ddp_complete *hdr;
+ unsigned int ddp_report, buf_idx, when;
+
+#ifdef notyet
+ if (unlikely(sk_no_receive(sk))) {
+ handle_excess_rx(sk, skb);
+ return;
+ }
+#endif
+ q = &toep->tp_ddp_state;
+ hdr = cplhdr(m);
+ ddp_report = ntohl(hdr->ddp_report);
+ buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+ bsp = &q->buf_state[buf_idx];
+
+ when = bsp->cur_offset;
+ m->m_len = G_DDP_OFFSET(ddp_report) - when;
+
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(sk),
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report 0x%x offset %u, len %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report), m->m_len);
+#endif
+
+ bsp->cur_offset += m->m_len;
+
+ if (!(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1; /* flip buffers */
+
+#ifdef T3_TRACE
+ T3_TRACE4(TIDTB(sk),
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report %u offset %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report));
+#endif
+#if 0
+ skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+ m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
+ if (bsp->flags & DDP_BF_NOCOPY)
+ bsp->flags &= ~DDP_BF_NOCOPY;
+ m->m_pkthdr.csum_data = tp->rcv_nxt;
+ tp->rcv_nxt += m->m_len;
+
+ tp->t_rcvtime = ticks;
+ sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+#endif
+}
+
/*
+ * Handler for RX_DDP_COMPLETE CPL messages.
+ */
+static int
+do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = ctx;
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+#if 0
+ skb->h.th = tcphdr_skb->h.th;
+#endif
+ process_ddp_complete(so, m);
+ return (0);
+}
+
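new_rx_data_ddp() and process_ddp_complete() above queue the mbuf with several
pkthdr fields overloaded, so any DDP-aware consumer (e.g. a modified soreceive
path) must decode them instead of treating the mbuf conventionally. A minimal
sketch of that decoding, assuming the DDP_BF_* constants from cxgb_t3_ddp.h;
ddp_mbuf_decode() is a hypothetical helper, not part of this change:

/*
 * Hypothetical consumer-side decoding of the overloaded mbuf fields
 * (field meanings per the comments above):
 *   m_pkthdr.csum_data  - tp->rcv_nxt before this placement
 *   m_len               - number of bytes the adapter placed via DDP
 *   m_pkthdr.csum_flags - bit 0 set when a DDP buffer completed, plus
 *                         DDP_BF_NOCOPY and DDP_BF_PSH as set above
 */
static void
ddp_mbuf_decode(const struct mbuf *m)
{
	uint32_t prev_rcv_nxt = m->m_pkthdr.csum_data;
	int placed = m->m_len;

	if (m->m_pkthdr.csum_flags & 1) {
		/* A DDP buffer completed; the buffers were flipped. */
	}
	if (m->m_pkthdr.csum_flags & DDP_BF_NOCOPY) {
		/* Data landed directly in the user buffer; skip copyout. */
	}
	if (m->m_pkthdr.csum_flags & DDP_BF_PSH) {
		/* Peer sent PSH; deliver without waiting for more data. */
	}
	(void)prev_rcv_nxt;
	(void)placed;
}

Reusing csum_data/csum_flags is workable here because these mbufs have already
left the checksum paths by the time they sit on so_rcv.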
+/*
* Move a socket to TIME_WAIT state. We need to make some adjustments to the
* socket state before calling tcp_time_wait to comply with its expectations.
*/
@@ -1902,8 +2108,7 @@
struct toepcb *toep = tp->t_toe;
if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
- toep->tp_flags |= TP_ABORT_REQ_RCVD;
- toep->tp_flags |= TP_ABORT_SHUTDOWN;
+ toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
m_free(m);
return;
}
@@ -2055,7 +2260,7 @@
UNIMPLEMENTED();
#ifdef notyet
- struct sock *newso;
+ struct socket *newso;
struct l2t_entry *e;
struct rtentry *dst;
struct tcpcb *newtp;
@@ -3008,11 +3213,10 @@
t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
+ t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
+ t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
#ifdef notyet
-
t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
- t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
- t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
#endif
==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#3 (text+ko) ====
@@ -70,12 +70,14 @@
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
+#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
@@ -85,6 +87,7 @@
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
+#define TMP_IOV_MAX 16
void
t3_init_socket_ops(void)
@@ -96,30 +99,311 @@
pru_soreceive = prp->pr_usrreqs->pru_soreceive;
}
+
+struct cxgb_dma_info {
+ size_t cdi_mapped;
+ int cdi_nsegs;
+ bus_dma_segment_t *cdi_segs;
+};
+
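+/*
+ * busdma load callback: snapshot the mapping results.  The segment array
+ * belongs to busdma; the caller consumes cdi_segs immediately after
+ * bus_dmamap_load_uio() returns and before the map is unloaded.
+ */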
+static void
+cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
+ bus_size_t mapsize, int error)
+{
+ struct cxgb_dma_info *cdi = arg;
+
+ cdi->cdi_mapped = mapsize;
+ cdi->cdi_nsegs = nsegs;
+ cdi->cdi_segs = segs;
+}
+
+static void
+iov_adj(struct iovec **iov, int *iovcnt, ssize_t count)
+{
+ struct iovec *iovtmp;
+ int iovcnttmp;
+ caddr_t ptmp;
+
+ if (count > 0) {
+ iovtmp = *iov;
+ iovcnttmp = *iovcnt;
+ while (count > 0) {
+ if (count < iovtmp->iov_len) {
+ ptmp = iovtmp->iov_base;
+ ptmp += count;
+ iovtmp->iov_base = ptmp;
+ iovtmp->iov_len -= count;
+ break;
+ } else
+ count -= iovtmp->iov_len;
+ iovtmp++;
+ iovcnttmp--;
+ }
+ *iov = iovtmp;
+ *iovcnt = iovcnttmp;
+ } else if (count < 0) {
+ iovtmp = &(*iov)[*iovcnt - 1];
+ iovcnttmp = *iovcnt;
+ while (count < 0) {
+ if (-count < iovtmp->iov_len) {
+ iovtmp->iov_len += count;
+ break;
+ } else
+ count += iovtmp->iov_len;
+ iovtmp--;
+ iovcnttmp--;
+ }
+ *iovcnt = iovcnttmp;
+ }
+}
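iov_adj() trims a logical iovec array in place: a positive count advances past
bytes already consumed at the front, and a negative count shaves unheld bytes
off the tail (hence the -hold_resid call below). A userland sketch of the
intended semantics, assuming iov_adj() above (with the ssize_t count) is
pasted into the same file:

#include <sys/types.h>
#include <sys/uio.h>
#include <stdio.h>

/* iov_adj() from above is assumed to be defined here. */

int
main(void)
{
	char a[100], b[100];
	struct iovec vec[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	struct iovec *iov = vec;
	int iovcnt = 2;

	/* Skip the 120 bytes already sent: all of a, 20 bytes of b. */
	iov_adj(&iov, &iovcnt, 120);
	printf("%d iovec(s), first len %zu\n", iovcnt, iov->iov_len);
	/* -> 1 iovec(s), first len 80 */

	/* Drop 30 bytes that could not be held from the tail. */
	iov_adj(&iov, &iovcnt, -30);
	printf("%d iovec(s), first len %zu\n", iovcnt, iov->iov_len);
	/* -> 1 iovec(s), first len 50 */
	return (0);
}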
+
+static void
+cxgb_zero_copy_free(void *cl, void *arg)
+{
+}
+
+static int
+cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
+{
+
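+ /* XXX: page holding not implemented yet */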
+ return (EINVAL);
+}
+
+static void
+cxgb_wait_dma_completion(struct toepcb *tp)
+{
+
+}
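+ /* XXX: stub; DMA completion tracking not implemented yet */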
+
+static int
+cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
+{
+ int i, seg_count, err, type;
+ struct mbuf *m0;
+ struct cxgb_dma_info cdi;
+ struct mbuf_vec *mv;
+ struct mbuf_iovec *mi;
+ bus_dma_segment_t *segs;
+
+ err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
+ cxgb_dma_callback, &cdi, 0);
+
+ if (err)
+ return (err);
+ seg_count = cdi.cdi_nsegs;
+ if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
+ bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
+ return (ENOMEM);
+ }
+ segs = cdi.cdi_segs;
+ m0->m_type = type;
+ m0->m_flags = (M_EXT|M_NOFREE);
+ m0->m_ext.ext_type = EXT_EXTREF;
+ m0->m_ext.ext_free = cxgb_zero_copy_free;
+ m0->m_ext.ext_args = NULL;
+
+ mv = mtomv(m0);
+ mv->mv_count = seg_count;
+ mv->mv_first = 0;
+ for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
+ mi_collapse_sge(mi, segs);
+
+ *m = m0;
+
+ if (cdi.cdi_mapped < uio->uio_resid)
+ uio->uio_resid -= cdi.cdi_mapped;
+ else
+ uio->uio_resid = 0;
+
+ return (0);
+}
+
+static int
+t3_sosend(struct socket *so, struct uio *uio)
+{
+ int rv, count, hold_resid, sent, iovcnt;
+ struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m;
+ struct uio uiotmp;
+
+ /*
+ * Events requiring iteration:
+ * - number of pages exceeds max hold pages for process or system
+ * - number of pages exceeds maximum SG entries for a single WR
+ *
+ * We're limited to holding 128 pages at once, and we're limited to
+ * 34 SG entries per work request, but each SG entry can be any number
+ * of contiguous pages.
+ */
+
+ uiotmp = *uio;
+ iovcnt = uio->uio_iovcnt;
+ iov = uio->uio_iov;
+ sent = 0;
+sendmore:
+ /*
+ * Make sure we don't exceed the socket buffer
+ */
+ count = min(toep->tp_page_count,
+     (sbspace(&so->so_snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
+ rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
+ hold_resid = uiotmp.uio_resid;
+ if (rv)
+ return (rv);
+
+ /*
+ * Bump past sent and shave off the unheld amount
+ */
+ if (hold_resid > 0) {
+ iovtmpp = iovtmp;
+ memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+ if (sent)
+ iov_adj(&iovtmpp, &iovcnt, sent);
+ iov_adj(&iovtmpp, &iovcnt, -hold_resid);
+ uiotmp.uio_iov = iovtmpp;
+ uiotmp.uio_iovcnt = iovcnt;
+
+ }
+ uiotmp.uio_resid = uio->uio_resid - hold_resid;
+
+ /*
+ * Push off all held pages.
+ */
+ while (uiotmp.uio_resid > 0) {
+ rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
+ if (rv) {
+ vm_fault_unhold_pages(toep->tp_pages, count);
+ return (rv);
+ }
+ uio->uio_resid -= m->m_pkthdr.len;
+ sent += m->m_pkthdr.len;
+ sbappend_locked(&so->so_snd, m);
+ t3_push_frames(so, TRUE);
+ iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
+ }
+ /*
+ * Wait for pending I/O to be DMA'd to the card.
+ */
+ cxgb_wait_dma_completion(toep);
+ vm_fault_unhold_pages(toep->tp_pages, count);
+ /*
+ * If there is more data to send, adjust the local copy of the iov
+ * to point to the start of the unsent data.
+ */
+ if (hold_resid) {
+ iovtmpp = iovtmp;
+ memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+ iov_adj(&iovtmpp, &iovcnt, sent);
+ uiotmp = *uio;
+ uiotmp.uio_iov = iovtmpp;
+ uiotmp.uio_iovcnt = iovcnt;
+ goto sendmore;
+ }
+
+ return (0);
+}
+
static int
cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
+ struct toedev *tdev = TOE_DEV(so);
+ int zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
+ int zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);
+ int rv;
+ struct tcpcb *tp = sototcpcb(so);
+
/*
- * punt it back to the stack if the overhead of copying is thought to
- * be less than the VM and DMA overhead of setting up page pods
+ * In order to use DMA directly from userspace, the following
+ * conditions must be met:
+ * - the connection is currently offloaded
+ * - ddp is enabled
+ * - the number of bytes to be transferred exceeds the threshold
+ * - the number of bytes currently in flight won't exceed the in-flight
+ * threshold XXX TODO
+ * - vm_fault_hold_user_pages succeeds
+ * - blocking socket XXX for now
+ *
+ */
+ if ((tp->t_flags & TF_TOE) && (uio->uio_resid > zcopy_thres) &&
+     (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0) &&
+     zcopy_enabled) {
+ rv = t3_sosend(so, uio);
+ if (rv != EAGAIN)
+ return (rv);
+ }
+
+ return (pru_sosend(so, addr, uio, top, control, flags, td));
+}
+
+static int
+t3_soreceive(struct socket *so, struct uio *uio)
+{
+#ifdef notyet
+ int i, rv, count, hold_resid, sent, iovcnt;
+ struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m;
+ struct uio uiotmp;
+
+ /*
+ * Events requiring iteration:
+ * - number of pages exceeds max hold pages for process or system
+ * - number of pages exceeds maximum SG entries for a single WR
+ *
+ * We're limited to holding 128 pages at once, and we're limited to
+ * 34 SG entries per work request, but each SG entry can be any number
+ * of contiguous pages.
*/
-#ifdef notyet
- if (uio->uio_resid < (40 << 10) /* XXX use tunable */)
-#endif
- return pru_sosend(so, addr, uio, top, control, flags, td);
-
+ uiotmp = *uio;
+ iovcnt = uio->uio_iovcnt;
+ iov = uio->uio_iov;
+ sent = 0;
+ re;
+#endif
+ return (0);
}
static int
cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
-#ifdef notyet
- if (uio->uio_resid < (40 << 10) /* XXX use tunable */)
-#endif
- return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
+ struct toedev *tdev = TOE_DEV(so);
+ int zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
+ int zcopy_enabled = TOM_TUNABLE(tdev, ddp);
+ int rv;
+ struct tcpcb *tp = sototcpcb(so);
+
+ /*
+ * In order to use DMA directly to userspace, the following
+ * conditions must be met:
+ * - the connection is currently offloaded
+ * - ddp is enabled
+ * - the number of bytes to be transferred exceeds the threshold
+ * - the number of bytes currently in flight won't exceed the in-flight
+ * threshold XXX TODO
+ * - vm_fault_hold_user_pages succeeds
+ * - blocking socket XXX for now
+ * - iovcnt is 1
+ *
+ */
+ if ((tp->t_flags & TF_TOE) && (uio->uio_resid > zcopy_thres) &&
+     (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0) &&
+     zcopy_enabled) {
+ rv = t3_soreceive(so, uio);
+ if (rv != EAGAIN)
+ return (rv);
+ }
+
+ return (pru_soreceive(so, psa, uio, mp0, controlp, flagsp));
}
==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.c#12 (text+ko) ====
@@ -77,6 +77,7 @@
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
static int activated = 1;
TUNABLE_INT("hw.t3toe.activated", &activated);