git: 6469aab051de - stable/12 - Coalesce socket reads in software iSCSI.
Alexander Motin
mav at FreeBSD.org
Mon Mar 15 03:01:27 UTC 2021
The branch stable/12 has been updated by mav:
URL: https://cgit.FreeBSD.org/src/commit/?id=6469aab051dee71c88e27aa906c18efa09e77189
commit 6469aab051dee71c88e27aa906c18efa09e77189
Author: Alexander Motin <mav at FreeBSD.org>
AuthorDate: 2021-02-22 17:23:35 +0000
Commit: Alexander Motin <mav at FreeBSD.org>
CommitDate: 2021-03-15 02:45:25 +0000
Coalesce socket reads in software iSCSI.
Instead of 2-4 socket reads per PDU this can do as low as one read
per megabyte, dramatically reducing TCP overhead and lock contention.
With this on iSCSI target I can write more than 4GB/s through a
single connection.
MFC after: 1 month
(cherry picked from commit 6895f89fe54e0858aea70d2bd2a9651f45d7998e)
---
sys/dev/iscsi/icl_soft.c | 258 ++++++++++++++++-------------------------------
1 file changed, 89 insertions(+), 169 deletions(-)
diff --git a/sys/dev/iscsi/icl_soft.c b/sys/dev/iscsi/icl_soft.c
index dd9210642ffa..caeddc9247ba 100644
--- a/sys/dev/iscsi/icl_soft.c
+++ b/sys/dev/iscsi/icl_soft.c
@@ -143,68 +143,6 @@ icl_conn_fail(struct icl_conn *ic)
(ic->ic_error)(ic);
}
-static struct mbuf *
-icl_conn_receive(struct icl_conn *ic, size_t len)
-{
- struct uio uio;
- struct socket *so;
- struct mbuf *m;
- int error, flags;
-
- so = ic->ic_socket;
-
- memset(&uio, 0, sizeof(uio));
- uio.uio_resid = len;
-
- flags = MSG_DONTWAIT;
- error = soreceive(so, NULL, &uio, &m, NULL, &flags);
- if (error != 0) {
- ICL_DEBUG("soreceive error %d", error);
- return (NULL);
- }
- if (uio.uio_resid != 0) {
- m_freem(m);
- ICL_DEBUG("short read");
- return (NULL);
- }
-
- return (m);
-}
-
-static int
-icl_conn_receive_buf(struct icl_conn *ic, void *buf, size_t len)
-{
- struct iovec iov[1];
- struct uio uio;
- struct socket *so;
- int error, flags;
-
- so = ic->ic_socket;
-
- memset(&uio, 0, sizeof(uio));
- iov[0].iov_base = buf;
- iov[0].iov_len = len;
- uio.uio_iov = iov;
- uio.uio_iovcnt = 1;
- uio.uio_offset = 0;
- uio.uio_resid = len;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_rw = UIO_READ;
-
- flags = MSG_DONTWAIT;
- error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
- if (error != 0) {
- ICL_DEBUG("soreceive error %d", error);
- return (-1);
- }
- if (uio.uio_resid != 0) {
- ICL_DEBUG("short read");
- return (-1);
- }
-
- return (0);
-}
-
static void
icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
@@ -318,37 +256,28 @@ icl_pdu_size(const struct icl_pdu *response)
return (len);
}
-static int
-icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
+static void
+icl_soft_receive_buf(struct mbuf **r, size_t *rs, void *buf, size_t s)
{
- if (icl_conn_receive_buf(request->ip_conn,
- request->ip_bhs, sizeof(struct iscsi_bhs))) {
- ICL_DEBUG("failed to receive BHS");
- return (-1);
- }
-
- *availablep -= sizeof(struct iscsi_bhs);
- return (0);
+ m_copydata(*r, 0, s, buf);
+ m_adj(*r, s);
+ while ((*r) != NULL && (*r)->m_len == 0)
+ *r = m_free(*r);
+ *rs -= s;
}
-static int
-icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
+static void
+icl_pdu_receive_ahs(struct icl_pdu *request, struct mbuf **r, size_t *rs)
{
request->ip_ahs_len = icl_pdu_ahs_length(request);
if (request->ip_ahs_len == 0)
- return (0);
-
- request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
- request->ip_ahs_len);
- if (request->ip_ahs_mbuf == NULL) {
- ICL_DEBUG("failed to receive AHS");
- return (-1);
- }
+ return;
- *availablep -= request->ip_ahs_len;
- return (0);
+ request->ip_ahs_mbuf = *r;
+ *r = m_split(request->ip_ahs_mbuf, request->ip_ahs_len, M_WAITOK);
+ *rs -= request->ip_ahs_len;
}
static uint32_t
@@ -367,7 +296,7 @@ icl_mbuf_to_crc32c(const struct mbuf *m0)
}
static int
-icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
+icl_pdu_check_header_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs)
{
uint32_t received_digest, valid_digest;
@@ -375,12 +304,7 @@ icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
return (0);
CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
- if (icl_conn_receive_buf(request->ip_conn,
- &received_digest, ISCSI_HEADER_DIGEST_SIZE)) {
- ICL_DEBUG("failed to receive header digest");
- return (-1);
- }
- *availablep -= ISCSI_HEADER_DIGEST_SIZE;
+ icl_soft_receive_buf(r, rs, &received_digest, ISCSI_HEADER_DIGEST_SIZE);
/* Temporary attach AHS to BHS to calculate header digest. */
request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf;
@@ -448,8 +372,8 @@ icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
}
static int
-icl_pdu_receive_data_segment(struct icl_pdu *request,
- size_t *availablep, bool *more_neededp)
+icl_pdu_receive_data_segment(struct icl_pdu *request, struct mbuf **r,
+ size_t *rs, bool *more_neededp)
{
struct icl_conn *ic;
size_t len, padding = 0;
@@ -473,7 +397,7 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
len -= request->ip_data_len;
- if (len + padding > *availablep) {
+ if (len + padding > *rs) {
/*
* Not enough data in the socket buffer. Receive as much
* as we can. Don't receive padding, since, obviously, it's
@@ -481,9 +405,9 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
*/
#if 0
ICL_DEBUG("limited from %zd to %zd",
- len + padding, *availablep - padding));
+ len + padding, *rs - padding));
#endif
- len = *availablep - padding;
+ len = *rs - padding;
*more_neededp = true;
padding = 0;
}
@@ -493,11 +417,9 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
* of actual data segment.
*/
if (len > 0) {
- m = icl_conn_receive(request->ip_conn, len + padding);
- if (m == NULL) {
- ICL_DEBUG("failed to receive data segment");
- return (-1);
- }
+ m = *r;
+ *r = m_split(m, len + padding, M_WAITOK);
+ *rs -= len + padding;
if (request->ip_data_mbuf == NULL)
request->ip_data_mbuf = m;
@@ -505,7 +427,6 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
m_cat(request->ip_data_mbuf, m);
request->ip_data_len += len;
- *availablep -= len + padding;
} else
ICL_DEBUG("len 0");
@@ -517,7 +438,7 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
}
static int
-icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
+icl_pdu_check_data_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs)
{
uint32_t received_digest, valid_digest;
@@ -528,12 +449,7 @@ icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
return (0);
CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
- if (icl_conn_receive_buf(request->ip_conn,
- &received_digest, ISCSI_DATA_DIGEST_SIZE)) {
- ICL_DEBUG("failed to receive data digest");
- return (-1);
- }
- *availablep -= ISCSI_DATA_DIGEST_SIZE;
+ icl_soft_receive_buf(r, rs, &received_digest, ISCSI_DATA_DIGEST_SIZE);
/*
* Note that ip_data_mbuf also contains padding; since digest
@@ -555,16 +471,13 @@ icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
* "part" of PDU at a time; call it repeatedly until it returns non-NULL.
*/
static struct icl_pdu *
-icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
+icl_conn_receive_pdu(struct icl_conn *ic, struct mbuf **r, size_t *rs)
{
struct icl_pdu *request;
- struct socket *so;
size_t len;
- int error;
+ int error = 0;
bool more_needed;
- so = ic->ic_socket;
-
if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
KASSERT(ic->ic_receive_pdu == NULL,
("ic->ic_receive_pdu != NULL"));
@@ -582,23 +495,11 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
request = ic->ic_receive_pdu;
}
- if (*availablep < ic->ic_receive_len) {
-#if 0
- ICL_DEBUG("not enough data; need %zd, "
- "have %zd", ic->ic_receive_len, *availablep);
-#endif
- return (NULL);
- }
-
switch (ic->ic_receive_state) {
case ICL_CONN_STATE_BHS:
//ICL_DEBUG("receiving BHS");
- error = icl_pdu_receive_bhs(request, availablep);
- if (error != 0) {
- ICL_DEBUG("failed to receive BHS; "
- "dropping connection");
- break;
- }
+ icl_soft_receive_buf(r, rs, request->ip_bhs,
+ sizeof(struct iscsi_bhs));
/*
* We don't enforce any limit for AHS length;
@@ -622,12 +523,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
case ICL_CONN_STATE_AHS:
//ICL_DEBUG("receiving AHS");
- error = icl_pdu_receive_ahs(request, availablep);
- if (error != 0) {
- ICL_DEBUG("failed to receive AHS; "
- "dropping connection");
- break;
- }
+ icl_pdu_receive_ahs(request, r, rs);
ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
if (ic->ic_header_crc32c == false)
ic->ic_receive_len = 0;
@@ -637,7 +533,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
case ICL_CONN_STATE_HEADER_DIGEST:
//ICL_DEBUG("receiving header digest");
- error = icl_pdu_check_header_digest(request, availablep);
+ error = icl_pdu_check_header_digest(request, r, rs);
if (error != 0) {
ICL_DEBUG("header digest failed; "
"dropping connection");
@@ -651,7 +547,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
case ICL_CONN_STATE_DATA:
//ICL_DEBUG("receiving data segment");
- error = icl_pdu_receive_data_segment(request, availablep,
+ error = icl_pdu_receive_data_segment(request, r, rs,
&more_needed);
if (error != 0) {
ICL_DEBUG("failed to receive data segment;"
@@ -671,7 +567,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
case ICL_CONN_STATE_DATA_DIGEST:
//ICL_DEBUG("receiving data digest");
- error = icl_pdu_check_data_digest(request, availablep);
+ error = icl_pdu_check_data_digest(request, r, rs);
if (error != 0) {
ICL_DEBUG("data digest failed; "
"dropping connection");
@@ -703,44 +599,27 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
}
static void
-icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
+icl_conn_receive_pdus(struct icl_conn *ic, struct mbuf **r, size_t *rs)
{
struct icl_pdu *response;
- struct socket *so;
-
- so = ic->ic_socket;
-
- /*
- * This can never happen; we're careful to only mess with ic->ic_socket
- * pointer when the send/receive threads are not running.
- */
- KASSERT(so != NULL, ("NULL socket"));
for (;;) {
if (ic->ic_disconnecting)
return;
- if (so->so_error != 0) {
- ICL_DEBUG("connection error %d; "
- "dropping connection", so->so_error);
- icl_conn_fail(ic);
- return;
- }
-
/*
* Loop until we have a complete PDU or there is not enough
* data in the socket buffer.
*/
- if (available < ic->ic_receive_len) {
+ if (*rs < ic->ic_receive_len) {
#if 0
- ICL_DEBUG("not enough data; have %zd, "
- "need %zd", available,
- ic->ic_receive_len);
+ ICL_DEBUG("not enough data; have %zd, need %zd",
+ *rs, ic->ic_receive_len);
#endif
return;
}
- response = icl_conn_receive_pdu(ic, &available);
+ response = icl_conn_receive_pdu(ic, r, rs);
if (response == NULL)
continue;
@@ -761,15 +640,19 @@ static void
icl_receive_thread(void *arg)
{
struct icl_conn *ic;
- size_t available;
+ size_t available, read = 0;
struct socket *so;
+ struct mbuf *m, *r = NULL;
+ struct uio uio;
+ int error, flags;
ic = arg;
so = ic->ic_socket;
for (;;) {
+ SOCKBUF_LOCK(&so->so_rcv);
if (ic->ic_disconnecting) {
- //ICL_DEBUG("terminating");
+ SOCKBUF_UNLOCK(&so->so_rcv);
break;
}
@@ -779,18 +662,50 @@ icl_receive_thread(void *arg)
* to avoid unnecessary wakeups until there
* is enough data received to read the PDU.
*/
- SOCKBUF_LOCK(&so->so_rcv);
available = sbavail(&so->so_rcv);
- if (available < ic->ic_receive_len) {
- so->so_rcv.sb_lowat = ic->ic_receive_len;
+ if (read + available < ic->ic_receive_len) {
+ so->so_rcv.sb_lowat = ic->ic_receive_len - read;
cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
- } else
so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
+ available = sbavail(&so->so_rcv);
+ }
SOCKBUF_UNLOCK(&so->so_rcv);
- icl_conn_receive_pdus(ic, available);
+ if (available == 0) {
+ if (so->so_error != 0) {
+ ICL_DEBUG("connection error %d; "
+ "dropping connection", so->so_error);
+ icl_conn_fail(ic);
+ break;
+ }
+ continue;
+ }
+
+ memset(&uio, 0, sizeof(uio));
+ uio.uio_resid = available;
+ flags = MSG_DONTWAIT;
+ error = soreceive(so, NULL, &uio, &m, NULL, &flags);
+ if (error != 0) {
+ ICL_DEBUG("soreceive error %d", error);
+ break;
+ }
+ if (uio.uio_resid != 0) {
+ m_freem(m);
+ ICL_DEBUG("short read");
+ break;
+ }
+ if (r)
+ m_cat(r, m);
+ else
+ r = m;
+ read += available;
+
+ icl_conn_receive_pdus(ic, &r, &read);
}
+ if (r)
+ m_freem(r);
+
ICL_CONN_LOCK(ic);
ic->ic_receive_running = false;
cv_signal(&ic->ic_send_cv);
@@ -1366,12 +1281,17 @@ icl_soft_conn_close(struct icl_conn *ic)
struct icl_pdu *pdu;
struct socket *so;
- ICL_CONN_LOCK(ic);
-
/*
* Wake up the threads, so they can properly terminate.
+ * Receive thread sleeps on so->so_rcv lock, send on ic->ic_lock.
*/
- ic->ic_disconnecting = true;
+ ICL_CONN_LOCK(ic);
+ if (!ic->ic_disconnecting) {
+ so = ic->ic_socket;
+ SOCKBUF_LOCK(&so->so_rcv);
+ ic->ic_disconnecting = true;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ }
while (ic->ic_receive_running || ic->ic_send_running) {
cv_signal(&ic->ic_receive_cv);
cv_signal(&ic->ic_send_cv);
More information about the dev-commits-src-all
mailing list