git: 460ed6106cf0 - main - Add support for managing UFS/FFS snapshots to fsck_ffs(8).

From: Kirk McKusick <mckusick_at_FreeBSD.org>
Date: Wed, 09 Nov 2022 18:46:51 UTC
The branch main has been updated by mckusick:

URL: https://cgit.FreeBSD.org/src/commit/?id=460ed6106cf0854caff62e4eeba8ffcd00ab0690

commit 460ed6106cf0854caff62e4eeba8ffcd00ab0690
Author:     Kirk McKusick <mckusick@FreeBSD.org>
AuthorDate: 2022-11-09 18:44:03 +0000
Commit:     Kirk McKusick <mckusick@FreeBSD.org>
CommitDate: 2022-11-09 18:46:31 +0000

    Add support for managing UFS/FFS snapshots to fsck_ffs(8).
    
    The kernel handles the management of UFS/FFS snapshots. Since UFS/FFS
    updates filesystem data (rather than always writing changes to new
    locations like ZFS), the kernel must check every filesystem write
    to see if the block being written is part of a snapshot. If it is
    part of a snapshot, then the kernel must make a copy of the old
    block value into a newly allocated block for the snapshot before
    allowing the write to be done. Similarly, if a block is being freed,
    the kernel must check to see if it is part of a snapshot and let
    the snapshot claim the block rather than freeing it for future use.
    When a snapshot is freed, its blocks need to be offered to older
    snapshots and freed only if no older snapshots wish to claim them.
    
    When snapshots were added to UFS/FFS they were integrated into soft
    updates and just a small part of the management of snapshots needed
    to be added to fsck_ffs(8) as soft updates minimized the set of
    snapshot changes that might need correction. When journaling was
    added to soft updates a much more complete knowledge of snapshots
    needed to be added to fsck_ffs(8) for it to be able to properly
    handle the filesystem changes that a journal rollback needs to do
    (specifically the freeing and allocation of blocks). Since this
    functionality was unavailable, the use of snapshots was disabled
    when running with journaled soft updates.
    
    This set of changes imports the kernel code for the management of
    snapshots to fsck_ffs(8). With this code in place it will become
    possible to enable snapshots when running with journaled soft
    updates. The most immediate benefit will be the ability to use
    snapshots to take consistent filesystem dumps on live filesystems.
    Future work will be done to update fsck_ffs(8) to be able to use
    snapshots to run in background on live filesystems running with
    journaled soft updates.
    
    Reviewed by:  kib
    Tested by:    Peter Holm
    Sponsored by: The FreeBSD Foundation
    Differential Revision: https://reviews.freebsd.org/D36491
---
 sbin/fsck_ffs/dir.c    |  21 ++-
 sbin/fsck_ffs/fsck.h   |  29 +++-
 sbin/fsck_ffs/fsutil.c | 234 +++++++++++++++++---------
 sbin/fsck_ffs/inode.c  | 433 ++++++++++++++++++++++++++++++++++++++++++++++++-
 sbin/fsck_ffs/main.c   |   1 +
 sbin/fsck_ffs/setup.c  | 182 ++++++++++++++++++++-
 sbin/fsck_ffs/suj.c    |  96 +++++++++--
 7 files changed, 891 insertions(+), 105 deletions(-)

diff --git a/sbin/fsck_ffs/dir.c b/sbin/fsck_ffs/dir.c
index ba286a965513..d09e6940f812 100644
--- a/sbin/fsck_ffs/dir.c
+++ b/sbin/fsck_ffs/dir.c
@@ -679,14 +679,17 @@ expanddir(struct inode *ip, char *name)
 	struct bufarea *bp, *nbp;
 	struct inodesc idesc;
 	union dinode *dp;
-	int indiralloced;
+	long cg, indiralloced;
 	char *cp;
 
 	nbp = NULL;
 	indiralloced = newblk = indirblk = 0;
+	memset(&idesc, 0, sizeof(struct inodesc));
+	idesc.id_type = ADDR;
 	pwarn("NO SPACE LEFT IN %s", name);
 	if (!preen && reply("EXPAND") == 0)
 		return (0);
+	cg = ino_to_cg(&sblock, ip->i_number);
 	dp = ip->i_dp;
 	filesize = DIP(dp, di_size);
 	lastlbn = lblkno(&sblock, filesize);
@@ -705,7 +708,8 @@ expanddir(struct inode *ip, char *name)
 		bp = getdirblk(oldblk, lastlbnsize);
 		if (bp->b_errs)
 			goto bad;
-		if ((newblk = allocblk(sblock.fs_frag)) == 0)
+		newblk = allocblk(cg, sblock.fs_frag, std_checkblkavail);
+		if (newblk == 0)
 			goto bad;
 		nbp = getdatablk(newblk, sblock.fs_bsize, BT_DIRDATA);
 		if (nbp->b_errs)
@@ -724,6 +728,7 @@ expanddir(struct inode *ip, char *name)
 			memmove(cp, &emptydir, sizeof emptydir);
 		dirty(nbp);
 		brelse(nbp);
+		binval(bp);
 		idesc.id_blkno = oldblk;
 		idesc.id_numfrags = numfrags(&sblock, lastlbnsize);
 		(void)freeblock(&idesc);
@@ -731,7 +736,7 @@ expanddir(struct inode *ip, char *name)
 			printf(" (EXPANDED)\n");
 		return (1);
 	}
-	if ((newblk = allocblk(sblock.fs_frag)) == 0)
+	if ((newblk = allocblk(cg, sblock.fs_frag, std_checkblkavail)) == 0)
 		goto bad;
 	bp = getdirblk(newblk, sblock.fs_bsize);
 	if (bp->b_errs)
@@ -749,8 +754,12 @@ expanddir(struct inode *ip, char *name)
 		 * Allocate indirect block if needed.
 		 */
 		if ((indirblk = DIP(dp, di_ib[0])) == 0) {
-			if ((indirblk = allocblk(sblock.fs_frag)) == 0)
+			indirblk = allocblk(cg, sblock.fs_frag,
+			    std_checkblkavail);
+			if (indirblk == 0) {
+				binval(bp);
 				goto bad;
+			}
 			indiralloced = 1;
 		}
 		nbp = getdatablk(indirblk, sblock.fs_bsize, BT_LEVEL1);
@@ -774,8 +783,10 @@ expanddir(struct inode *ip, char *name)
 	return (1);
 bad:
 	pfatal(" (EXPANSION FAILED)\n");
-	if (nbp != NULL)
+	if (nbp != NULL) {
+		binval(bp);
 		brelse(nbp);
+	}
 	if (newblk != 0) {
 		idesc.id_blkno = newblk;
 		idesc.id_numfrags = sblock.fs_frag;
diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h
index d1b45a0850da..a00fedd9ef90 100644
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@@ -200,8 +200,7 @@ struct bufarea {
 #define	BT_INODES 	 7	/* Buffer holds inodes */
 #define	BT_DIRDATA 	 8	/* Buffer holds directory data */
 #define	BT_DATA	 	 9	/* Buffer holds user data */
-#define	BT_EMPTY 	10	/* Buffer allocated but not filled */
-#define BT_NUMBUFTYPES	11
+#define BT_NUMBUFTYPES	10
 #define BT_NAMES {			\
 	"unknown",			\
 	"Superblock",			\
@@ -212,8 +211,7 @@ struct bufarea {
 	"External Attribute",		\
 	"Inode Block",			\
 	"Directory Contents",		\
-	"User Data",			\
-	"Allocated but not filled" }
+	"User Data" }
 extern char *buftype[];
 #define BT_BUFTYPE(type) \
 	type < BT_NUMBUFTYPES ? buftype[type] : buftype[BT_UNKNOWN]
@@ -234,7 +232,7 @@ extern struct bufarea *pdirbp;		/* current directory contents */
 		(bp)->b_flags |= B_DIRTY; \
 } while (0)
 #define	initbarea(bp, type) do { \
-	(bp)->b_bno = (ufs2_daddr_t)-1; \
+	(bp)->b_bno = (ufs2_daddr_t)-4; \
 	(bp)->b_size = 0; \
 	(bp)->b_errs = 0; \
 	(bp)->b_flags = 0; \
@@ -347,6 +345,7 @@ extern char *blockmap;		/* ptr to primary blk allocation map */
 extern char *cdevname;		/* name of device being checked */
 extern char ckclean;		/* only do work if not cleanly unmounted */
 extern int ckhashadd;		/* check hashes to be added */
+extern char *copybuf;		/* buffer to copy snapshot blocks */
 extern int cvtlevel;		/* convert to newer file system format */
 extern long dev_bsize;		/* computed value of DEV_BSIZE */
 extern u_int real_dev_bsize;	/* actual disk sector size, not overridden */
@@ -371,6 +370,8 @@ extern char resolved;		/* cleared if unresolved changes => not clean */
 extern int returntosingle;	/* 1 => return to single user mode on exit */
 extern long secsize;		/* actual disk sector size */
 extern char skipclean;		/* skip clean file systems if preening */
+extern int snapcnt;		/* number of active snapshots */
+extern struct inode snaplist[FSMAXSNAP + 1]; /* list of active snapshots */
 extern char snapname[BUFSIZ];	/* when doing snapshots, the name of the file */
 extern int sujrecovery;		/* 1 => doing check using the journal */
 extern int surrender;		/* Give up if reads fail */
@@ -441,9 +442,11 @@ struct fstab;
 
 void		adjust(struct inodesc *, int lcnt);
 void		alarmhandler(int sig);
-ufs2_daddr_t	allocblk(long frags);
+ufs2_daddr_t	allocblk(long cg, long frags, ufs2_daddr_t (*checkblkavail)
+		    (ufs2_daddr_t blkno, long frags));
 ino_t		allocdir(ino_t parent, ino_t request, int mode);
 ino_t		allocino(ino_t request, int type);
+void		binval(struct bufarea *);
 void		blkerror(ino_t ino, const char *type, ufs2_daddr_t blk);
 char	       *blockcheck(char *name);
 int		blread(int fd, char *buf, ufs2_daddr_t blk, long size);
@@ -458,12 +461,15 @@ void		catchquit(int);
 void		cgdirty(struct bufarea *);
 struct bufarea *cglookup(int cg);
 int		changeino(ino_t dir, const char *name, ino_t newnum);
+void		check_blkcnt(struct inode *ip);
 int		check_cgmagic(int cg, struct bufarea *cgbp, int requestrebuild);
 int		chkrange(ufs2_daddr_t blk, int cnt);
 void		ckfini(int markclean);
 int		ckinode(union dinode *dp, struct inodesc *);
 void		clri(struct inodesc *, const char *type, int flag);
 int		clearentry(struct inodesc *);
+void		copyonwrite(struct fs *, struct bufarea *,
+		    ufs2_daddr_t (*checkblkavail)(long, long));
 void		direrror(ino_t ino, const char *errmesg);
 int		dirscan(struct inodesc *);
 int		dofix(struct inodesc *, const char *msg);
@@ -476,6 +482,7 @@ void		flush(int fd, struct bufarea *bp);
 int		freeblock(struct inodesc *);
 void		freeino(ino_t ino);
 void		freeinodebuf(void);
+void		fsckinit(void);
 void		fsutilinit(void);
 int		ftypeok(union dinode *dp);
 void		getblk(struct bufarea *bp, ufs2_daddr_t blk, long size);
@@ -484,6 +491,7 @@ struct inoinfo *getinoinfo(ino_t inumber);
 union dinode   *getnextinode(ino_t inumber, int rebuildcg);
 void		getpathname(char *namebuf, ino_t curdir, ino_t ino);
 void		ginode(ino_t, struct inode *);
+void		gjournal_check(const char *filesys);
 void		infohandler(int sig);
 void		irelse(struct inode *);
 ufs2_daddr_t	ino_blkatoff(union dinode *, ino_t, ufs_lbn_t, int *,
@@ -505,6 +513,7 @@ void		pass4(void);
 void		pass5(void);
 void		pfatal(const char *fmt, ...) __printflike(1, 2);
 void		propagate(void);
+void		prtbuf(struct bufarea *, const char *, ...) __printflike(2, 3);
 void		prtinode(struct inode *);
 void		pwarn(const char *fmt, ...) __printflike(1, 2);
 int		readsb(void);
@@ -513,9 +522,13 @@ void		rwerror(const char *mesg, ufs2_daddr_t blk);
 void		sblock_init(void);
 void		setinodebuf(int, ino_t);
 int		setup(char *dev);
-void		gjournal_check(const char *filesys);
+int		snapblkfree(struct fs *, ufs2_daddr_t, long, ino_t,
+		    ufs2_daddr_t (*)(ufs2_daddr_t, long));
+void		snapremove(ino_t);
+void		snapflush(ufs2_daddr_t (*checkblkavail)(long, long));
+ufs2_daddr_t	std_checkblkavail(ufs2_daddr_t blkno, long frags);
+ufs2_daddr_t	suj_checkblkavail(ufs2_daddr_t, long);
 int		suj_check(const char *filesys);
 void		update_maps(struct cg *, struct cg*, int);
-void		fsckinit(void);
 
 #endif	/* !_FSCK_H_ */
diff --git a/sbin/fsck_ffs/fsutil.c b/sbin/fsck_ffs/fsutil.c
index f1834e1235d8..5be058dc40c2 100644
--- a/sbin/fsck_ffs/fsutil.c
+++ b/sbin/fsck_ffs/fsutil.c
@@ -71,7 +71,6 @@ static void cg_write(struct bufarea *);
 static void slowio_start(void);
 static void slowio_end(void);
 static void printIOstats(void);
-static void prtbuf(const char *, struct bufarea *);
 
 static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */
 static struct timespec startpass, finishpass;
@@ -79,8 +78,10 @@ struct timeval slowio_starttime;
 int slowio_delay_usec = 10000;	/* Initial IO delay for background fsck */
 int slowio_pollcnt;
 static struct bufarea cgblk;	/* backup buffer for cylinder group blocks */
+static struct bufarea failedbuf; /* returned by failed getdatablk() */
 static TAILQ_HEAD(bufqueue, bufarea) bufqueuehd; /* head of buffer cache LRU */
 static LIST_HEAD(bufhash, bufarea) bufhashhd[HASHSIZE]; /* buffer hash list */
+static struct bufhash freebufs;	/* unused buffers */
 static int numbufs;		/* size of buffer cache */
 static int cachelookups;	/* number of cache lookups */
 static int cachereads;		/* number of cache reads */
@@ -187,11 +188,15 @@ bufinit(void)
 {
 	int i;
 
+	initbarea(&failedbuf, BT_UNKNOWN);
+	failedbuf.b_errs = -1;
+	failedbuf.b_un.b_buf = NULL;
 	if ((cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize)) == NULL)
 		errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize);
 	initbarea(&cgblk, BT_CYLGRP);
 	numbufs = cachelookups = cachereads = 0;
 	TAILQ_INIT(&bufqueuehd);
+	LIST_INIT(&freebufs);
 	for (i = 0; i < HASHSIZE; i++)
 		LIST_INIT(&bufhashhd[i]);
 	for (i = 0; i < BT_NUMBUFTYPES; i++) {
@@ -300,7 +305,7 @@ flushentry(void)
 }
 
 /*
- * Manage a cache of directory blocks.
+ * Manage a cache of filesystem disk blocks.
  */
 struct bufarea *
 getdatablk(ufs2_daddr_t blkno, long size, int type)
@@ -309,19 +314,23 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
 	struct bufhash *bhdp;
 
 	cachelookups++;
-	/* If out of range, return empty buffer with b_err == -1 */
-	if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize)) {
-		blkno = -1;
-		type = BT_EMPTY;
-	}
+	/*
+	 * If out of range, return empty buffer with b_errs == -1
+	 *
+	 * Skip check for inodes because chkrange() considers
+	 * metadata areas invalid to write data.
+	 */
+	if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize))
+		return (&failedbuf);
 	bhdp = &bufhashhd[HASH(blkno)];
 	LIST_FOREACH(bp, bhdp, b_hash)
 		if (bp->b_bno == fsbtodb(&sblock, blkno)) {
 			if (debug && bp->b_size != size) {
-				prtbuf("getdatablk: size mismatch", bp);
+				prtbuf(bp, "getdatablk: size mismatch");
 				pfatal("getdatablk: b_size %d != size %ld\n",
 				    bp->b_size, size);
 			}
+			TAILQ_REMOVE(&bufqueuehd, bp, b_list);
 			goto foundit;
 		}
 	/*
@@ -340,7 +349,9 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
 	if (size > sblock.fs_bsize)
 		errx(EEXIT, "Excessive buffer size %ld > %d\n", size,
 		    sblock.fs_bsize);
-	if (numbufs < MINBUFS) {
+	if ((bp = LIST_FIRST(&freebufs)) != NULL) {
+		LIST_REMOVE(bp, b_hash);
+	} else if (numbufs < MINBUFS) {
 		bp = allocbuf("cannot create minimal buffer pool");
 	} else if (sujrecovery) {
 		/*
@@ -368,6 +379,7 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
 		else
 			LIST_REMOVE(bp, b_hash);
 	}
+	TAILQ_REMOVE(&bufqueuehd, bp, b_list);
 	flush(fswritefd, bp);
 	bp->b_type = type;
 	LIST_INSERT_HEAD(bhdp, bp, b_hash);
@@ -375,13 +387,12 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
 	cachereads++;
 	/* fall through */
 foundit:
+	TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
 	if (debug && bp->b_type != type) {
 		printf("getdatablk: buffer type changed to %s",
 		    BT_BUFTYPE(type));
-		prtbuf("", bp);
+		prtbuf(bp, "");
 	}
-	TAILQ_REMOVE(&bufqueuehd, bp, b_list);
-	TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
 	if (bp->b_errs == 0)
 		bp->b_refcnt++;
 	return (bp);
@@ -401,11 +412,7 @@ getblk(struct bufarea *bp, ufs2_daddr_t blk, long size)
 			readcnt[bp->b_type]++;
 			clock_gettime(CLOCK_REALTIME_PRECISE, &start);
 		}
-		if (bp->b_type != BT_EMPTY)
-			bp->b_errs =
-			    blread(fsreadfd, bp->b_un.b_buf, dblk, size);
-		else
-			bp->b_errs = -1;
+		bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size);
 		if (debug) {
 			clock_gettime(CLOCK_REALTIME_PRECISE, &finish);
 			timespecsub(&finish, &start, &finish);
@@ -422,10 +429,19 @@ brelse(struct bufarea *bp)
 {
 
 	if (bp->b_refcnt <= 0)
-		prtbuf("brelse: buffer with negative reference count", bp);
+		prtbuf(bp, "brelse: buffer with negative reference count");
 	bp->b_refcnt--;
 }
 
+void
+binval(struct bufarea *bp)
+{
+
+	bp->b_flags &= ~B_DIRTY;
+	LIST_REMOVE(bp, b_hash);
+	LIST_INSERT_HEAD(&freebufs, bp, b_hash);
+}
+
 void
 flush(int fd, struct bufarea *bp)
 {
@@ -451,10 +467,18 @@ flush(int fd, struct bufarea *bp)
 		if (bp != &sblk)
 			pfatal("BUFFER %p DOES NOT MATCH SBLK %p\n",
 			    bp, &sblk);
+		/*
+		 * Superblocks are always pre-copied so we do not need
+		 * to check them for copy-on-write.
+		 */
 		if (sbput(fd, bp->b_un.b_fs, 0) == 0)
 			fsmodified = 1;
 		break;
 	case BT_CYLGRP:
+		/*
+		 * Cylinder groups are always pre-copied so we do not
+		 * need to check them for copy-on-write.
+		 */
 		if (sujrecovery)
 			cg_write(bp);
 		if (cgput(fswritefd, &sblock, bp->b_un.b_cg) == 0)
@@ -483,11 +507,38 @@ flush(int fd, struct bufarea *bp)
 		}
 		/* FALLTHROUGH */
 	default:
+		copyonwrite(&sblock, bp, std_checkblkavail);
 		blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size);
 		break;
 	}
 }
 
+/*
+ * If there are any snapshots, ensure that all the blocks that they
+ * care about have been copied, then release the snapshot inodes.
+ * These operations need to be done before we rebuild the cylinder
+ * groups so that any block allocations are properly recorded.
+ * Since all the cylinder group maps have already been copied in
+ * the snapshots, no further snapshot copies will need to be done.
+ */
+void
+snapflush(ufs2_daddr_t (*checkblkavail)(long, long))
+{
+	struct bufarea *bp;
+	int cnt;
+
+	if (snapcnt > 0) {
+		if (debug)
+			printf("Check for snapshot copies\n");
+		TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list)
+			if ((bp->b_flags & B_DIRTY) != 0)
+				copyonwrite(&sblock, bp, checkblkavail);
+		for (cnt = 0; cnt < snapcnt; cnt++)
+			irelse(&snaplist[cnt]);
+		snapcnt = 0;
+	}
+}
+
 /*
  * Journaled soft updates does not maintain cylinder group summary
  * information during cleanup, so this routine recalculates the summary
@@ -499,6 +550,7 @@ cg_write(struct bufarea *bp)
 {
 	ufs1_daddr_t fragno, cgbno, maxbno;
 	u_int8_t *blksfree;
+	struct csum *csp;
 	struct cg *cgp;
 	int blk;
 	int i;
@@ -536,6 +588,11 @@ cg_write(struct bufarea *bp)
 	 * Update the superblock cg summary from our now correct values
 	 * before writing the block.
 	 */
+	csp = &sblock.fs_cs(&sblock, cgp->cg_cgx);
+	sblock.fs_cstotal.cs_ndir += cgp->cg_cs.cs_ndir - csp->cs_ndir;
+	sblock.fs_cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree - csp->cs_nbfree;
+	sblock.fs_cstotal.cs_nifree += cgp->cg_cs.cs_nifree - csp->cs_nifree;
+	sblock.fs_cstotal.cs_nffree += cgp->cg_cs.cs_nffree - csp->cs_nffree;
 	sblock.fs_cs(&sblock, cgp->cg_cgx) = cgp->cg_cs;
 }
 
@@ -587,6 +644,7 @@ ckfini(int markclean)
 		(void)close(fsreadfd);
 		return;
 	}
+
 	/*
 	 * To remain idempotent with partial truncations the buffers
 	 * must be flushed in this order:
@@ -629,14 +687,9 @@ ckfini(int markclean)
 		case BT_SUPERBLK:
 		case BT_CYLGRP:
 		default:
-			prtbuf("ckfini: improper buffer type on cache list",bp);
+			prtbuf(bp,"ckfini: improper buffer type on cache list");
 			continue;
 		/* These are the ones to flush in this step */
-		case BT_EMPTY:
-			if (bp->b_bno >= 0)
-				pfatal("Unused BT_EMPTY buffer for block %jd\n",
-				    (intmax_t)bp->b_bno);
-			/* FALLTHROUGH */
 		case BT_LEVEL1:
 		case BT_LEVEL2:
 		case BT_LEVEL3:
@@ -648,11 +701,10 @@ ckfini(int markclean)
 		case BT_INODES:
 			continue;
 		}
-		if (debug && bp->b_refcnt != 0) {
-			prtbuf("ckfini: clearing in-use buffer", bp);
-			pfatal("ckfini: clearing in-use buffer\n");
-		}
+		if (debug && bp->b_refcnt != 0)
+			prtbuf(bp, "ckfini: clearing in-use buffer");
 		TAILQ_REMOVE(&bufqueuehd, bp, b_list);
+		LIST_REMOVE(bp, b_hash);
 		cnt++;
 		flush(fswritefd, bp);
 		free(bp->b_un.b_buf);
@@ -666,11 +718,10 @@ ckfini(int markclean)
 		icachebp = NULL;
 	}
 	TAILQ_FOREACH_REVERSE_SAFE(bp, &bufqueuehd, bufqueue, b_list, nbp) {
-		if (debug && bp->b_refcnt != 0) {
-			prtbuf("ckfini: clearing in-use buffer", bp);
-			pfatal("ckfini: clearing in-use buffer\n");
-		}
+		if (debug && bp->b_refcnt != 0)
+			prtbuf(bp, "ckfini: clearing in-use buffer");
 		TAILQ_REMOVE(&bufqueuehd, bp, b_list);
+		LIST_REMOVE(bp, b_hash);
 		cnt++;
 		flush(fswritefd, bp);
 		free(bp->b_un.b_buf);
@@ -1050,45 +1101,77 @@ check_cgmagic(int cg, struct bufarea *cgbp, int request_rebuild)
  * allocate a data block with the specified number of fragments
  */
 ufs2_daddr_t
-allocblk(long frags)
+allocblk(long startcg, long frags,
+    ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags))
 {
-	int i, j, k, cg, baseblk;
-	struct bufarea *cgbp;
-	struct cg *cgp;
+	ufs2_daddr_t blkno, newblk;
 
+	if (sujrecovery && checkblkavail == std_checkblkavail) {
+		pfatal("allocblk: std_checkblkavail used for SUJ recovery\n");
+		return (0);
+	}
 	if (frags <= 0 || frags > sblock.fs_frag)
 		return (0);
-	for (i = 0; i < maxfsblock - sblock.fs_frag; i += sblock.fs_frag) {
-		for (j = 0; j <= sblock.fs_frag - frags; j++) {
-			if (testbmap(i + j))
-				continue;
-			for (k = 1; k < frags; k++)
-				if (testbmap(i + j + k))
-					break;
-			if (k < frags) {
-				j += k;
-				continue;
-			}
-			cg = dtog(&sblock, i + j);
-			cgbp = cglookup(cg);
-			cgp = cgbp->b_un.b_cg;
-			if (!check_cgmagic(cg, cgbp, 0)) {
-				i = (cg + 1) * sblock.fs_fpg - sblock.fs_frag;
-				continue;
-			}
-			baseblk = dtogd(&sblock, i + j);
-			for (k = 0; k < frags; k++) {
-				setbmap(i + j + k);
-				clrbit(cg_blksfree(cgp), baseblk + k);
-			}
-			n_blks += frags;
-			if (frags == sblock.fs_frag)
-				cgp->cg_cs.cs_nbfree--;
-			else
-				cgp->cg_cs.cs_nffree -= frags;
-			cgdirty(cgbp);
-			return (i + j);
+	for (blkno = cgdata(&sblock, startcg);
+	     blkno < maxfsblock - sblock.fs_frag;
+	     blkno += sblock.fs_frag) {
+		if ((newblk = (*checkblkavail)(blkno, frags)) == 0)
+			continue;
+		if (newblk > 0)
+			return (newblk);
+		if (newblk < 0)
+			blkno = -newblk;
+	}
+	for (blkno = cgdata(&sblock, 0);
+	     blkno < cgbase(&sblock, startcg) - sblock.fs_frag;
+	     blkno += sblock.fs_frag) {
+		if ((newblk = (*checkblkavail)(blkno, frags)) == 0)
+			continue;
+		if (newblk > 0)
+			return (newblk);
+		if (newblk < 0)
+			blkno = -newblk;
+	}
+	return (0);
+}
+
+ufs2_daddr_t
+std_checkblkavail(blkno, frags)
+	ufs2_daddr_t blkno;
+	long frags;
+{
+	struct bufarea *cgbp;
+	struct cg *cgp;
+	ufs2_daddr_t j, k, baseblk;
+	long cg;
+
+	for (j = 0; j <= sblock.fs_frag - frags; j++) {
+		if (testbmap(blkno + j))
+			continue;
+		for (k = 1; k < frags; k++)
+			if (testbmap(blkno + j + k))
+				break;
+		if (k < frags) {
+			j += k;
+			continue;
 		}
+		cg = dtog(&sblock, blkno + j);
+		cgbp = cglookup(cg);
+		cgp = cgbp->b_un.b_cg;
+		if (!check_cgmagic(cg, cgbp, 0))
+			return (-((cg + 1) * sblock.fs_fpg - sblock.fs_frag));
+		baseblk = dtogd(&sblock, blkno + j);
+		for (k = 0; k < frags; k++) {
+			setbmap(blkno + j + k);
+			clrbit(cg_blksfree(cgp), baseblk + k);
+		}
+		n_blks += frags;
+		if (frags == sblock.fs_frag)
+			cgp->cg_cs.cs_nbfree--;
+		else
+			cgp->cg_cs.cs_nffree -= frags;
+		cgdirty(cgbp);
+		return (blkno + j);
 	}
 	return (0);
 }
@@ -1261,14 +1344,19 @@ dofix(struct inodesc *idesc, const char *msg)
 /*
  * Print details about a buffer.
  */
-static void
-prtbuf(const char *msg, struct bufarea *bp)
+void
+prtbuf(struct bufarea *bp, const char *fmt, ...)
 {
-	
-	printf("%s: bp %p, type %s, bno %jd, size %d, refcnt %d, flags %s, "
-	    "index %jd\n", msg, bp, BT_BUFTYPE(bp->b_type),
-	    (intmax_t) bp->b_bno, bp->b_size, bp->b_refcnt,
-	    bp->b_flags & B_DIRTY ? "dirty" : "clean", (intmax_t) bp->b_index);
+	va_list ap;
+	va_start(ap, fmt);
+	if (preen)
+		(void)fprintf(stdout, "%s: ", cdevname);
+	(void)vfprintf(stdout, fmt, ap);
+	va_end(ap);
+	printf(": bp %p, type %s, bno %jd, size %d, refcnt %d, flags %s, "
+	    "index %jd\n", bp, BT_BUFTYPE(bp->b_type), (intmax_t) bp->b_bno,
+	    bp->b_size, bp->b_refcnt, bp->b_flags & B_DIRTY ? "dirty" : "clean",
+	    (intmax_t) bp->b_index);
 }
 
 /*
diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c
index c261b254ace9..1d36a6ca3e45 100644
--- a/sbin/fsck_ffs/inode.c
+++ b/sbin/fsck_ffs/inode.c
@@ -38,6 +38,7 @@ static const char sccsid[] = "@(#)inode.c	8.8 (Berkeley) 4/28/95";
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/stat.h>
 #include <sys/stdint.h>
 #include <sys/sysctl.h>
 
@@ -58,6 +59,9 @@ struct bufarea *icachebp;	/* inode cache buffer */
 static int iblock(struct inodesc *, off_t isize, int type);
 static ufs2_daddr_t indir_blkatoff(ufs2_daddr_t, ino_t, ufs_lbn_t, ufs_lbn_t,
     struct bufarea **);
+static int snapclean(struct inodesc *idesc);
+static void chkcopyonwrite(struct fs *, ufs2_daddr_t,
+    ufs2_daddr_t (*checkblkavail)(long, long));
 
 int
 ckinode(union dinode *dp, struct inodesc *idesc)
@@ -378,8 +382,12 @@ chkrange(ufs2_daddr_t blk, int cnt)
 	int c;
 
 	if (cnt <= 0 || blk <= 0 || blk > maxfsblock ||
-	    cnt - 1 > maxfsblock - blk)
+	    cnt - 1 > maxfsblock - blk) {
+		if (debug)
+			printf("out of range: blk %ld, offset %i, size %d\n",
+			    (long)blk, (int)fragnum(&sblock, blk), cnt);
 		return (1);
+	}
 	if (cnt > sblock.fs_frag ||
 	    fragnum(&sblock, blk) + cnt > sblock.fs_frag) {
 		if (debug)
@@ -650,11 +658,21 @@ int
 freeblock(struct inodesc *idesc)
 {
 	struct dups *dlp;
+	struct bufarea *cgbp;
+	struct cg *cgp;
 	ufs2_daddr_t blkno;
-	long nfrags, res;
+	long size, nfrags, res;
 
 	res = KEEPON;
 	blkno = idesc->id_blkno;
+	if (idesc->id_type == SNAP) {
+		pfatal("clearing a snapshot dinode\n");
+		return (STOP);
+	}
+	size = lfragtosize(&sblock, idesc->id_numfrags);
+	if (snapblkfree(&sblock, blkno, size, idesc->id_number,
+	    std_checkblkavail))
+		return (res);
 	for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) {
 		if (chkrange(blkno, 1)) {
 			res = SKIP;
@@ -674,12 +692,407 @@ freeblock(struct inodesc *idesc)
 			}
 		}
 	}
+	/*
+	 * If all successfully returned, account for them.
+	 */
+	if (nfrags == 0) {
+		cgbp = cglookup(dtog(&sblock, idesc->id_blkno));
+		cgp = cgbp->b_un.b_cg;
+		if (idesc->id_numfrags == sblock.fs_frag)
+			cgp->cg_cs.cs_nbfree++;
+		else
+			cgp->cg_cs.cs_nffree += idesc->id_numfrags;
+		cgdirty(cgbp);
+	}
 	return (res);
 }
 
+/*
+ * Prepare a snapshot file for being removed.
+ */
+void
+snapremove(ino_t inum)
+{
+	struct inodesc idesc;
+	struct inode ip;
+	int i;
+
+	for (i = 0; i < snapcnt; i++)
+		if (snaplist[i].i_number == inum)
+			break;
+	if (i == snapcnt)
+		ginode(inum, &ip);
+	else
+		ip = snaplist[i];
+	if ((DIP(ip.i_dp, di_flags) & SF_SNAPSHOT) == 0) {
+		printf("snapremove: inode %jd is not a snapshot\n",
+		    (intmax_t)inum);
+		if (i == snapcnt)
+			irelse(&ip);
+		return;
+	}
+	if (debug)
+		printf("snapremove: remove %sactive snapshot %jd\n",
+		    i == snapcnt ? "in" : "", (intmax_t)inum);
+	/*
+	 * If on active snapshot list, remove it.
+	 */
+	if (i < snapcnt) {
+		for (i++; i < FSMAXSNAP; i++) {
+			if (sblock.fs_snapinum[i] == 0)
+				break;
+			snaplist[i - 1] = snaplist[i];
+			sblock.fs_snapinum[i - 1] = sblock.fs_snapinum[i];
+		}
+		sblock.fs_snapinum[i - 1] = 0;
+		bzero(&snaplist[i - 1], sizeof(struct inode));
+		snapcnt--;
+	}
+	idesc.id_type = SNAP;
+	idesc.id_func = snapclean;
+	idesc.id_number = inum;
+	(void)ckinode(ip.i_dp, &idesc);
+	DIP_SET(ip.i_dp, di_flags, DIP(ip.i_dp, di_flags) & ~SF_SNAPSHOT);
+	inodirty(&ip);
+	irelse(&ip);
+}
+
+static int
+snapclean(struct inodesc *idesc)
+{
+	ufs2_daddr_t blkno;
+	struct bufarea *bp;
+	union dinode *dp;
+
+	blkno = idesc->id_blkno;
+	if (blkno == 0)
+		return (KEEPON);
+
+	bp = idesc->id_bp;
+	dp = idesc->id_dp;
+	if (blkno == BLK_NOCOPY || blkno == BLK_SNAP) {
+		if (idesc->id_lbn < UFS_NDADDR)
+			DIP_SET(dp, di_db[idesc->id_lbn], 0);
+		else
+			IBLK_SET(bp, bp->b_index, 0);
+		dirty(bp);
+	}
+	return (KEEPON);
+}
+
+/*
+ * Notification that a block is being freed. Return zero if the free
+ * should be allowed to proceed. Return non-zero if the snapshot file
+ * wants to claim the block. The block will be claimed if it is an
+ * uncopied part of one of the snapshots. It will be freed if it is
+ * either a BLK_NOCOPY or has already been copied in all of the snapshots.
+ * If a fragment is being freed, then all snapshots that care about
+ * it must make a copy since a snapshot file can only claim full sized
+ * blocks. Note that if more than one snapshot file maps the block,
+ * we can pick one at random to claim it. Since none of the snapshots
+ * can change, we are assured that they will all see the same unmodified
+ * image. When deleting a snapshot file (see ino_trunc above), we
+ * must push any of these claimed blocks to one of the other snapshots
+ * that maps it. These claimed blocks are easily identified as they will
+ * have a block number equal to their logical block number within the
+ * snapshot. A copied block can never have this property because they
+ * must always have been allocated from a BLK_NOCOPY location.
+ */
+int
+snapblkfree(fs, bno, size, inum, checkblkavail)
+	struct fs *fs;
+	ufs2_daddr_t bno;
+	long size;
+	ino_t inum;
+	ufs2_daddr_t (*checkblkavail)(long cg, long frags);
+{
+	union dinode *dp;
+	struct inode ip;
+	struct bufarea *snapbp;
+	ufs_lbn_t lbn;
+	ufs2_daddr_t blkno, relblkno;
+	int i, frags, claimedblk, copydone;
+
+	/* If no snapshots, nothing to do */
+	if (snapcnt == 0)
+		return (0);
+	if (debug)
+		printf("snapblkfree: in ino %ld free blkno %ld, size %ld\n",
+		    inum, bno, size);
+	relblkno = blknum(fs, bno);
+	lbn = fragstoblks(fs, relblkno);
+	/* Direct blocks are always pre-copied */
+	if (lbn < UFS_NDADDR)
+		return (0);
+	copydone = 0;
+	claimedblk = 0;
+	for (i = 0; i < snapcnt; i++) {
+		/*
+		 * Lookup block being freed.
+		 */
+		ip = snaplist[i];
+		dp = ip.i_dp;
+		blkno = ino_blkatoff(dp, inum != 0 ? inum : ip.i_number,
+		    lbn, &frags, &snapbp);
+		/*
+		 * Check to see if block needs to be copied.
+		 */
+		if (blkno == 0) {
+			/*
+			 * A block that we map is being freed. If it has not
+			 * been claimed yet, we will claim or copy it (below).
+			 */
+			claimedblk = 1;
+		} else if (blkno == BLK_SNAP) {
+			/*
+			 * No previous snapshot claimed the block,
+			 * so it will be freed and become a BLK_NOCOPY
+			 * (don't care) for us.
+			 */
+			if (claimedblk)
+				pfatal("snapblkfree: inconsistent block type");
+			IBLK_SET(snapbp, snapbp->b_index, BLK_NOCOPY);
+			dirty(snapbp);
+			brelse(snapbp);
+			continue;
+		} else /* BLK_NOCOPY or default */ {
+			/*
+			 * If the snapshot has already copied the block
+			 * (default), or does not care about the block,
+			 * it is not needed.
+			 */
+			brelse(snapbp);
+			continue;
+		}
+		/*
+		 * If this is a full size block, we will just grab it
+		 * and assign it to the snapshot inode. Otherwise we
+		 * will proceed to copy it. See explanation for this
+		 * routine as to why only a single snapshot needs to
+		 * claim this block.
+		 */
+		if (size == fs->fs_bsize) {
+			if (debug)
+				printf("Grabonremove snapshot %ju lbn %jd "
+				    "from inum %ju\n", (intmax_t)ip.i_number,
+				    (intmax_t)lbn, (uintmax_t)inum);
+			IBLK_SET(snapbp, snapbp->b_index, relblkno);
+			dirty(snapbp);
+			brelse(snapbp);
+			DIP_SET(dp, di_blocks,
+			    DIP(dp, di_blocks) + btodb(size));
+			inodirty(&ip);
+			return (1);
+		}
+
+		/* First time through, read the contents of the old block. */
+		if (copydone == 0) {
+			copydone = 1;
+			if (blread(fsreadfd, copybuf, fsbtodb(fs, relblkno),
+			    fs->fs_bsize) != 0) {
+				pfatal("Could not read snapshot %ju block "
+				    "%jd\n", (intmax_t)ip.i_number,
+				    (intmax_t)relblkno);
+				continue;
+			}
+		}
+		/*
+		 * This allocation will never require any additional
+		 * allocations for the snapshot inode.
+		 */
+		blkno = allocblk(dtog(fs, relblkno), fs->fs_frag,
+		    checkblkavail);
+		if (blkno == 0) {
+			pfatal("Could not allocate block for snapshot %ju\n",
+			    (intmax_t)ip.i_number);
+			continue;
+		}
+		if (debug)
+			printf("Copyonremove: snapino %jd lbn %jd for inum %ju "
+			    "size %ld new blkno %jd\n", (intmax_t)ip.i_number,
+			    (intmax_t)lbn, (uintmax_t)inum, size,
+			    (intmax_t)blkno);
+		blwrite(fswritefd, copybuf, fsbtodb(fs, blkno), fs->fs_bsize);
+		IBLK_SET(snapbp, snapbp->b_index, blkno);
+		dirty(snapbp);
+		brelse(snapbp);
+		DIP_SET(dp, di_blocks,
+		    DIP(dp, di_blocks) + btodb(fs->fs_bsize));
+		inodirty(&ip);
+	}
+	return (0);
+}
+
+/*
+ * Notification that a block is being written. Return if the block
+ * is part of a snapshot as snapshots never track other snapshots.
+ * The block will be copied in all of the snapshots that are tracking
+ * it and have not yet copied it. Some buffers may hold more than one
+ * block. Here we need to check each block in the buffer.
+ */
+void
+copyonwrite(fs, bp, checkblkavail)
+	struct fs *fs;
+	struct bufarea *bp;
+	ufs2_daddr_t (*checkblkavail)(long cg, long frags);
+{
+	ufs2_daddr_t copyblkno;
+	long i, numblks;
+
+	/* If no snapshots, nothing to do. */
+	if (snapcnt == 0)
+		return;
+	numblks = blkroundup(fs, bp->b_size) / fs->fs_bsize;
+	if (debug)
+		prtbuf(bp, "copyonwrite: checking %jd block%s in buffer",
*** 632 LINES SKIPPED ***