git: cd9618bdb274 - main - bhyve: Snapshot impovements for 'blockif' backend

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Thu, 23 Jun 2022 18:46:56 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=cd9618bdb274375139080ee4e33ccbdc980513f3

commit cd9618bdb274375139080ee4e33ccbdc980513f3
Author:     Vitaliy Gusev <gusev.vitaliy@gmail.com>
AuthorDate: 2022-06-23 18:46:06 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2022-06-23 18:46:06 +0000

    bhyve: Snapshot impovements for 'blockif' backend
    
    When pausing a block I/O device model as part of suspending a VM, wait
    for all active block I/O requests to finish before saving snapshot
    data.  This avoids having to save information about in-flight requests
    both in the block_if layer and in storage device models.
    
    For the AHCI device model, the queues are now guaranteed to be idle
    when taking a snapshot, so remove the code to save queue state and
    rely on the initial state in a resumed VM having all queues already
    idle.
    
    This will also simplify adding NVMe snapshot support in the future.
    
    Reviewed by:    jhb
    Sponsored by:   vStack
    Differential Revision: https://reviews.freebsd.org/D26267
---
 usr.sbin/bhyve/block_if.c | 105 ++++------------------------------
 usr.sbin/bhyve/block_if.h |   4 --
 usr.sbin/bhyve/pci_ahci.c | 142 +---------------------------------------------
 3 files changed, 15 insertions(+), 236 deletions(-)

diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index 06c3cb50fb13..05c9d9731754 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -109,11 +109,9 @@ struct blockif_ctxt {
 	int			bc_psectoff;
 	int			bc_closing;
 	int			bc_paused;
-	int			bc_work_count;
 	pthread_t		bc_btid[BLOCKIF_NUMTHR];
 	pthread_mutex_t		bc_mtx;
 	pthread_cond_t		bc_cond;
-	pthread_cond_t		bc_paused_cond;
 	pthread_cond_t		bc_work_done_cond;
 	blockif_resize_cb	*bc_resize_cb;
 	void			*bc_resize_cb_arg;
@@ -362,6 +360,12 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
 	(*br->br_callback)(br, err);
 }
 
+static inline bool
+blockif_empty(const struct blockif_ctxt *bc)
+{
+	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
+}
+
 static void *
 blockif_thr(void *arg)
 {
@@ -379,30 +383,21 @@ blockif_thr(void *arg)
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	for (;;) {
-		bc->bc_work_count++;
-
-		/* We cannot process work if the interface is paused */
-		while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
+		while (blockif_dequeue(bc, t, &be)) {
 			pthread_mutex_unlock(&bc->bc_mtx);
 			blockif_proc(bc, be, buf);
 			pthread_mutex_lock(&bc->bc_mtx);
 			blockif_complete(bc, be);
 		}
 
-		bc->bc_work_count--;
-
-		/* If none of the workers are busy, notify the main thread */
-		if (bc->bc_work_count == 0)
+		/* If none to work, notify the main thread */
+		if (blockif_empty(bc))
 			pthread_cond_broadcast(&bc->bc_work_done_cond);
 
 		/* Check ctxt status here to see if exit requested */
 		if (bc->bc_closing)
 			break;
 
-		/* Make all worker threads wait here if the device is paused */
-		while (bc->bc_paused)
-			pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);
-
 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);
@@ -638,8 +633,6 @@ blockif_open(nvlist_t *nvl, const char *ident)
 	pthread_mutex_init(&bc->bc_mtx, NULL);
 	pthread_cond_init(&bc->bc_cond, NULL);
 	bc->bc_paused = 0;
-	bc->bc_work_count = 0;
-	pthread_cond_init(&bc->bc_paused_cond, NULL);
 	pthread_cond_init(&bc->bc_work_done_cond, NULL);
 	TAILQ_INIT(&bc->bc_freeq);
 	TAILQ_INIT(&bc->bc_pendq);
@@ -737,6 +730,7 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
 	err = 0;
 
 	pthread_mutex_lock(&bc->bc_mtx);
+	assert(!bc->bc_paused);
 	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
 		/*
 		 * Enqueue and inform the block i/o thread
@@ -761,7 +755,6 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
 int
 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_READ));
 }
@@ -769,7 +762,6 @@ blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
 int
 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_WRITE));
 }
@@ -777,7 +769,6 @@ blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
 int
 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_FLUSH));
 }
@@ -785,7 +776,6 @@ blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
 int
 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_DELETE));
 }
@@ -955,7 +945,6 @@ blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
 off_t
 blockif_size(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_size);
 }
@@ -963,7 +952,6 @@ blockif_size(struct blockif_ctxt *bc)
 int
 blockif_sectsz(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_sectsz);
 }
@@ -971,7 +959,6 @@ blockif_sectsz(struct blockif_ctxt *bc)
 void
 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	*size = bc->bc_psectsz;
 	*off = bc->bc_psectoff;
@@ -980,7 +967,6 @@ blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
 int
 blockif_queuesz(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (BLOCKIF_MAXREQ - 1);
 }
@@ -988,7 +974,6 @@ blockif_queuesz(struct blockif_ctxt *bc)
 int
 blockif_is_ro(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_rdonly);
 }
@@ -996,7 +981,6 @@ blockif_is_ro(struct blockif_ctxt *bc)
 int
 blockif_candelete(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_candelete);
 }
@@ -1012,7 +996,7 @@ blockif_pause(struct blockif_ctxt *bc)
 	bc->bc_paused = 1;
 
 	/* The interface is paused. Wait for workers to finish their work */
-	while (bc->bc_work_count)
+	while (!blockif_empty(bc))
 		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
 	pthread_mutex_unlock(&bc->bc_mtx);
 
@@ -1029,71 +1013,6 @@ blockif_resume(struct blockif_ctxt *bc)
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	bc->bc_paused = 0;
-	/* resume the threads waiting for paused */
-	pthread_cond_broadcast(&bc->bc_paused_cond);
-	/* kick the threads after restore */
-	pthread_cond_broadcast(&bc->bc_cond);
-	pthread_mutex_unlock(&bc->bc_mtx);
-}
-
-int
-blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
-{
-	int i;
-	struct iovec *iov;
-	int ret;
-
-	SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);
-
-	/*
-	 * XXX: The callback and parameter must be filled by the virtualized
-	 * device that uses the interface, during its init; we're not touching
-	 * them here.
-	 */
-
-	/* Snapshot the iovecs. */
-	for (i = 0; i < br->br_iovcnt; i++) {
-		iov = &br->br_iov[i];
-
-		SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);
-
-		/* We assume the iov is a guest-mapped address. */
-		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
-			false, meta, ret, done);
-	}
-
-done:
-	return (ret);
-}
-
-int
-blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
-{
-	int ret;
-
-	if (bc->bc_paused == 0) {
-		fprintf(stderr, "%s: Snapshot failed: "
-			"interface not paused.\r\n", __func__);
-		return (ENXIO);
-	}
-
-	pthread_mutex_lock(&bc->bc_mtx);
-
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);
-
-done:
 	pthread_mutex_unlock(&bc->bc_mtx);
-	return (ret);
 }
-#endif
+#endif	/* BHYVE_SNAPSHOT */
diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h
index 0407ff43cf94..cf3acc05c98f 100644
--- a/usr.sbin/bhyve/block_if.h
+++ b/usr.sbin/bhyve/block_if.h
@@ -87,10 +87,6 @@ int	blockif_close(struct blockif_ctxt *bc);
 #ifdef BHYVE_SNAPSHOT
 void	blockif_pause(struct blockif_ctxt *bc);
 void	blockif_resume(struct blockif_ctxt *bc);
-int	blockif_snapshot_req(struct blockif_req *br,
-    struct vm_snapshot_meta *meta);
-int	blockif_snapshot(struct blockif_ctxt *bc,
-    struct vm_snapshot_meta *meta);
 #endif
 
 #endif /* _BLOCK_IF_H_ */
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index d07b1f085e3d..ba8921ea3b6f 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -2562,114 +2562,14 @@ open_fail:
 }
 
 #ifdef BHYVE_SNAPSHOT
-static int
-pci_ahci_snapshot_save_queues(struct ahci_port *port,
-			      struct vm_snapshot_meta *meta)
-{
-	int ret;
-	int idx;
-	struct ahci_ioreq *ioreq;
-
-	STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) {
-		idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-	}
-
-	idx = -1;
-	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-
-	TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) {
-		idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-
-		/*
-		 * Snapshot only the busy requests; other requests are
-		 * not valid.
-		 */
-		ret = blockif_snapshot_req(&ioreq->io_req, meta);
-		if (ret != 0) {
-			fprintf(stderr, "%s: failed to snapshot req\r\n",
-				__func__);
-			goto done;
-		}
-	}
-
-	idx = -1;
-	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-
-done:
-	return (ret);
-}
-
-static int
-pci_ahci_snapshot_restore_queues(struct ahci_port *port,
-				 struct vm_snapshot_meta *meta)
-{
-	int ret;
-	int idx;
-	struct ahci_ioreq *ioreq;
-
-	/* Empty the free queue before restoring. */
-	while (!STAILQ_EMPTY(&port->iofhd))
-		STAILQ_REMOVE_HEAD(&port->iofhd, io_flist);
-
-	/* Restore the free queue. */
-	while (1) {
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-		if (idx == -1)
-			break;
-
-		STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist);
-	}
-
-	/* Restore the busy queue. */
-	while (1) {
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-		if (idx == -1)
-			break;
-
-		ioreq = &port->ioreq[idx];
-		TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist);
-
-		/*
-		 * Restore only the busy requests; other requests are
-		 * not valid.
-		 */
-		ret = blockif_snapshot_req(&ioreq->io_req, meta);
-		if (ret != 0) {
-			fprintf(stderr, "%s: failed to restore request\r\n",
-				__func__);
-			goto done;
-		}
-
-		/* Re-enqueue the requests in the block interface. */
-		if (ioreq->readop)
-			ret = blockif_read(port->bctx, &ioreq->io_req);
-		else
-			ret = blockif_write(port->bctx, &ioreq->io_req);
-
-		if (ret != 0) {
-			fprintf(stderr,
-				"%s: failed to re-enqueue request\r\n",
-				__func__);
-			goto done;
-		}
-	}
-
-done:
-	return (ret);
-}
-
 static int
 pci_ahci_snapshot(struct vm_snapshot_meta *meta)
 {
-	int i, j, ret;
+	int i, ret;
 	void *bctx;
 	struct pci_devinst *pi;
 	struct pci_ahci_softc *sc;
 	struct ahci_port *port;
-	struct ahci_cmd_hdr *hdr;
-	struct ahci_ioreq *ioreq;
 
 	pi = meta->dev_data;
 	sc = pi->pi_arg;
@@ -2753,43 +2653,7 @@ pci_ahci_snapshot(struct vm_snapshot_meta *meta)
 		SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done);
 
-		for (j = 0; j < port->ioqsz; j++) {
-			ioreq = &port->ioreq[j];
-
-			/* blockif_req snapshot done only for busy requests. */
-			hdr = (struct ahci_cmd_hdr *)(port->cmd_lst +
-				ioreq->slot * AHCI_CL_SIZE);
-			SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis,
-				0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry),
-				false, meta, ret, done);
-
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done);
-		}
-
-		/* Perform save / restore specific operations. */
-		if (meta->op == VM_SNAPSHOT_SAVE) {
-			ret = pci_ahci_snapshot_save_queues(port, meta);
-			if (ret != 0)
-				goto done;
-		} else if (meta->op == VM_SNAPSHOT_RESTORE) {
-			ret = pci_ahci_snapshot_restore_queues(port, meta);
-			if (ret != 0)
-				goto done;
-		} else {
-			ret = EINVAL;
-			goto done;
-		}
-
-		ret = blockif_snapshot(port->bctx, meta);
-		if (ret != 0) {
-			fprintf(stderr, "%s: failed to restore blockif\r\n",
-				__func__);
-			goto done;
-		}
+		assert(TAILQ_EMPTY(&port->iobhd));
 	}
 
 done:
@@ -2835,7 +2699,7 @@ pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi)
 
 	return (0);
 }
-#endif
+#endif	/* BHYVE_SNAPSHOT */
 
 /*
  * Use separate emulation names to distinguish drive and atapi devices