svn commit: r203012 - in projects/suj/head: lib/libufs
sbin/fsck_ffs sbin/mount sbin/tunefs sys/sys sys/ufs/ffs sys/ufs/ufs
Jeff Roberson
jroberson at jroberson.net
Tue Jan 26 06:38:12 UTC 2010
I forgot to mention: this change breaks backwards compatibility with earlier
suj releases. I think this is the last fs.h revision I will make for some
time to come.
Jeff
On Tue, 26 Jan 2010, Jeff Roberson wrote:
> Author: jeff
> Date: Tue Jan 26 06:36:10 2010
> New Revision: 203012
> URL: http://svn.freebsd.org/changeset/base/203012
>
> Log:
> - Move the softdep journal inode into the namespace at /.sujournal. This
> requires quite a lot of code as tunefs needs to be able to create
> directory entries in ROOTINO. However this is much cleaner from a
> compat standpoint. The inode is marked IMMUTABLE and only readable by
> root. Eventually the kernel will prevent clearing of the IMMUTABLE bit.
> - Fix a nasty link count bug involving changedirectory_offset(). When
> a link may exist at more than one location depending on when the
> directory block was written we create duplicate addref records. When
> an add and a remove are detected at the same offset the remove is
> discarded based on the assumption that it cancels the link in the add.
> A legitimate remove may collide with one of these alternate offset adds
> that are created by fsck and be discarded even though it removed a real
> link. To resolve this the lineage of the addref must be established
> to determine whether the remove refers to an alternate address or not.
> Any offset which is not up-to-date with respect to the offset in the
> move record is considered alternate and will not discard a remove.
> - Use clear_remove() when we begin to exhaust dependencies to prevent
> excessive looping in request_cleanup(). This should probably
> also be done in softdep_fsync(). Only workloads which delete
> incredible numbers of files within the same directory would be
> affected. stress2 can generate over 100,000 outstanding removes on
> my test machine.
>
> Modified:
> projects/suj/head/lib/libufs/cgroup.c
> projects/suj/head/lib/libufs/libufs.h
> projects/suj/head/sbin/fsck_ffs/pass4.c
> projects/suj/head/sbin/fsck_ffs/suj.c
> projects/suj/head/sbin/mount/mount.c
> projects/suj/head/sbin/tunefs/tunefs.c
> projects/suj/head/sys/sys/mount.h
> projects/suj/head/sys/ufs/ffs/ffs_alloc.c
> projects/suj/head/sys/ufs/ffs/ffs_softdep.c
> projects/suj/head/sys/ufs/ffs/ffs_vfsops.c
> projects/suj/head/sys/ufs/ffs/fs.h
> projects/suj/head/sys/ufs/ufs/inode.h
>
> Modified: projects/suj/head/lib/libufs/cgroup.c
> ==============================================================================
> --- projects/suj/head/lib/libufs/cgroup.c Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/lib/libufs/cgroup.c Tue Jan 26 06:36:10 2010 (r203012)
> @@ -71,6 +71,67 @@ gotit:
> return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, bno));
> }
>
> +int
> +cgbfree(struct uufsd *disk, ufs2_daddr_t bno, long size)
> +{
> + u_int8_t *blksfree;
> + struct fs *fs;
> + struct cg *cgp;
> + ufs1_daddr_t fragno, cgbno;
> + int i, cg, blk, frags, bbase;
> +
> + fs = &disk->d_fs;
> + cg = dtog(fs, bno);
> + if (cgread1(disk, cg) != 1)
> + return (-1);
> + cgp = &disk->d_cg;
> + cgbno = dtogd(fs, bno);
> + blksfree = cg_blksfree(cgp);
> + if (size == fs->fs_bsize) {
> + fragno = fragstoblks(fs, cgbno);
> + ffs_setblock(fs, blksfree, fragno);
> + ffs_clusteracct(fs, cgp, fragno, 1);
> + cgp->cg_cs.cs_nbfree++;
> + fs->fs_cstotal.cs_nbfree++;
> + fs->fs_cs(fs, cg).cs_nbfree++;
> + } else {
> + bbase = cgbno - fragnum(fs, cgbno);
> + /*
> + * decrement the counts associated with the old frags
> + */
> + blk = blkmap(fs, blksfree, bbase);
> + ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
> + /*
> + * deallocate the fragment
> + */
> + frags = numfrags(fs, size);
> + for (i = 0; i < frags; i++)
> + setbit(blksfree, cgbno + i);
> + cgp->cg_cs.cs_nffree += i;
> + fs->fs_cstotal.cs_nffree += i;
> + fs->fs_cs(fs, cg).cs_nffree += i;
> + /*
> + * add back in counts associated with the new frags
> + */
> + blk = blkmap(fs, blksfree, bbase);
> + ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
> + /*
> + * if a complete block has been reassembled, account for it
> + */
> + fragno = fragstoblks(fs, bbase);
> + if (ffs_isblock(fs, blksfree, fragno)) {
> + cgp->cg_cs.cs_nffree -= fs->fs_frag;
> + fs->fs_cstotal.cs_nffree -= fs->fs_frag;
> + fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
> + ffs_clusteracct(fs, cgp, fragno, 1);
> + cgp->cg_cs.cs_nbfree++;
> + fs->fs_cstotal.cs_nbfree++;
> + fs->fs_cs(fs, cg).cs_nbfree++;
> + }
> + }
> + return cgwrite(disk);
> +}
> +
> ino_t
> cgialloc(struct uufsd *disk)
> {
>
> Modified: projects/suj/head/lib/libufs/libufs.h
> ==============================================================================
> --- projects/suj/head/lib/libufs/libufs.h Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/lib/libufs/libufs.h Tue Jan 26 06:36:10 2010 (r203012)
> @@ -111,6 +111,7 @@ int berase(struct uufsd *, ufs2_daddr_t,
> * cgroup.c
> */
> ufs2_daddr_t cgballoc(struct uufsd *);
> +int cgbfree(struct uufsd *, ufs2_daddr_t, long);
> ino_t cgialloc(struct uufsd *);
> int cgread(struct uufsd *);
> int cgread1(struct uufsd *, int);
>
> Modified: projects/suj/head/sbin/fsck_ffs/pass4.c
> ==============================================================================
> --- projects/suj/head/sbin/fsck_ffs/pass4.c Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/sbin/fsck_ffs/pass4.c Tue Jan 26 06:36:10 2010 (r203012)
> @@ -72,9 +72,6 @@ pass4(void)
> for (i = 0; i < inostathead[cg].il_numalloced; i++, inumber++) {
> if (inumber < ROOTINO)
> continue;
> - if (sblock.fs_flags & FS_SUJ &&
> - inumber == sblock.fs_sujournal)
> - continue;
> idesc.id_number = inumber;
> switch (inoinfo(inumber)->ino_state) {
>
>
> Modified: projects/suj/head/sbin/fsck_ffs/suj.c
> ==============================================================================
> --- projects/suj/head/sbin/fsck_ffs/suj.c Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/sbin/fsck_ffs/suj.c Tue Jan 26 06:36:10 2010 (r203012)
> @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
> #include <stdlib.h>
> #include <stdint.h>
> #include <libufs.h>
> +#include <string.h>
> #include <strings.h>
> #include <err.h>
> #include <assert.h>
> @@ -63,6 +64,7 @@ struct suj_seg {
> struct suj_rec {
> TAILQ_ENTRY(suj_rec) sr_next;
> union jrec *sr_rec;
> + int sr_alt; /* Is alternate address? */
> };
> TAILQ_HEAD(srechd, suj_rec);
>
> @@ -127,6 +129,7 @@ TAILQ_HEAD(seghd, suj_seg) allsegs;
> uint64_t oldseq;
> static struct uufsd *disk = NULL;
> static struct fs *fs = NULL;
> +ino_t sujino;
>
> /*
> * Summary statistics.
> @@ -191,8 +194,7 @@ closedisk(const char *devnam)
> fs->fs_cstotal.cs_nifree += cgsum->cs_nifree;
> fs->fs_cstotal.cs_ndir += cgsum->cs_ndir;
> }
> - /* XXX Don't set clean for now, we don't trust the journal. */
> - /* fs->fs_clean = 1; */
> + fs->fs_clean = 1;
> fs->fs_time = time(NULL);
> fs->fs_mtime = time(NULL);
> if (sbwrite(disk, 0) == -1)
> @@ -1823,6 +1825,7 @@ ino_append(union jrec *rec)
> sino->si_hasrecs = 1;
> srec = errmalloc(sizeof(*srec));
> srec->sr_rec = rec;
> + srec->sr_alt = 0;
> TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next);
> }
>
> @@ -1844,9 +1847,10 @@ ino_build_ref(struct suj_ino *sino, stru
>
> refrec = (struct jrefrec *)srec->sr_rec;
> if (debug)
> - printf("ino_build: op %d, ino %d, nlink %d, parent %d, diroff %jd\n",
> - refrec->jr_op, refrec->jr_ino, refrec->jr_nlink, refrec->jr_parent,
> - refrec->jr_diroff);
> + printf("ino_build: op %d, ino %d, nlink %d, "
> + "parent %d, diroff %jd\n",
> + refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
> + refrec->jr_parent, refrec->jr_diroff);
>
> /*
> * Search for a mvrec that matches this offset. Whether it's an add
> @@ -1871,16 +1875,19 @@ ino_build_ref(struct suj_ino *sino, stru
> rrn = errmalloc(sizeof(*refrec));
> *rrn = *refrec;
> rrn->jr_op = JOP_ADDREF;
> + rrn->jr_diroff = mvrec->jm_oldoff;
> srn = errmalloc(sizeof(*srec));
> + srn->sr_alt = 1;
> srn->sr_rec = (union jrec *)rrn;
> ino_build_ref(sino, srn);
> - refrec->jr_diroff = mvrec->jm_oldoff;
> }
> }
> }
> /*
> * We walk backwards so that adds and removes are evaluated in the
> - * correct order.
> + * correct order. If a primary record conflicts with an alt keep
> + * the primary and discard the alt. We must track this to keep
> + * the correct number of removes in the list.
> */
> for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn;
> srn = TAILQ_PREV(srn, srechd, sr_next)) {
> @@ -1890,7 +1897,17 @@ ino_build_ref(struct suj_ino *sino, stru
> continue;
> if (debug)
> printf("Discarding dup.\n");
> - rrn->jr_mode = refrec->jr_mode;
> + if (srn->sr_alt == 0) {
> + rrn->jr_mode = refrec->jr_mode;
> + return;
> + }
> + /*
> + * Replace the record in place with the old nlink in case
> + * we replace the head of the list. Abandon srec as a dup.
> + */
> + refrec->jr_nlink = rrn->jr_nlink;
> + srn->sr_rec = srec->sr_rec;
> + srn->sr_alt = srec->sr_alt;
> return;
> }
> TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next);
> @@ -1930,9 +1947,12 @@ ino_move_ref(struct suj_ino *sino, struc
> /*
> * When an entry is moved we don't know whether the write
> * to move has completed yet. To resolve this we create
> - * a new add dependency in the new location as if it were added
> - * twice. Only one will succeed.
> + * a new add dependency in the new location as if it were
> + * added twice. Only one will succeed. Consider the
> + * new offset the primary location for the inode and the
> + * old offset the alt.
> */
> + srn->sr_alt = 1;
> refrec = errmalloc(sizeof(*refrec));
> refrec->jr_op = JOP_ADDREF;
> refrec->jr_ino = mvrec->jm_ino;
> @@ -1941,12 +1961,14 @@ ino_move_ref(struct suj_ino *sino, struc
> refrec->jr_mode = rrn->jr_mode;
> refrec->jr_nlink = rrn->jr_nlink;
> srn = errmalloc(sizeof(*srn));
> + srn->sr_alt = 0;
> srn->sr_rec = (union jrec *)refrec;
> ino_build_ref(sino, srn);
> break;
> }
> /*
> - * Add this mvrec to the queue of pending mvs.
> + * Add this mvrec to the queue of pending mvs, possibly collapsing
> + * it with a prior move for the same inode and offset.
> */
> for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn;
> srn = TAILQ_PREV(srn, srechd, sr_next)) {
> @@ -2195,19 +2217,25 @@ suj_verifyino(union dinode *ip)
>
> if (DIP(ip, di_nlink) != 1) {
> printf("Invalid link count %d for journal inode %d\n",
> - DIP(ip, di_nlink), fs->fs_sujournal);
> + DIP(ip, di_nlink), sujino);
> + return (-1);
> + }
> +
> + if (DIP(ip, di_flags) != (SF_IMMUTABLE | SF_NOUNLINK)) {
> + printf("Invalid flags 0x%X for journal inode %d\n",
> + DIP(ip, di_flags), sujino);
> return (-1);
> }
>
> - if (DIP(ip, di_mode) != IFREG) {
> - printf("Invalid mode %d for journal inode %d\n",
> - DIP(ip, di_mode), fs->fs_sujournal);
> + if (DIP(ip, di_mode) != (IFREG | IREAD)) {
> + printf("Invalid mode %o for journal inode %d\n",
> + DIP(ip, di_mode), sujino);
> return (-1);
> }
>
> if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) {
> printf("Invalid size %jd for journal inode %d\n",
> - DIP(ip, di_size), fs->fs_sujournal);
> + DIP(ip, di_size), sujino);
> return (-1);
> }
>
> @@ -2447,20 +2475,60 @@ restart:
> }
>
> /*
> + * Search a directory block for the SUJ_FILE.
> + */
> +static void
> +suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
> +{
> + char block[MAXBSIZE];
> + struct direct *dp;
> + int bytes;
> + int off;
> +
> + if (sujino)
> + return;
> + bytes = lfragtosize(fs, frags);
> + if (bread(disk, fsbtodb(fs, blk), block, bytes) <= 0)
> + err(1, "Failed to read ROOTINO directory block %jd", blk);
> + for (off = 0; off < bytes; off += dp->d_reclen) {
> + dp = (struct direct *)&block[off];
> + if (dp->d_reclen == 0)
> + break;
> + if (dp->d_ino == 0)
> + continue;
> + if (dp->d_namlen != strlen(SUJ_FILE))
> + continue;
> + if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
> + continue;
> + sujino = dp->d_ino;
> + return;
> + }
> +}
> +
> +/*
> * Orchestrate the verification of a filesystem via the softupdates journal.
> */
> int
> suj_check(const char *filesys)
> {
> union dinode *jip;
> + union dinode *ip;
> uint64_t blocks;
>
> opendisk(filesys);
> TAILQ_INIT(&allsegs);
> /*
> + * Find the journal inode.
> + */
> + ip = ino_read(ROOTINO);
> + sujino = 0;
> + ino_visit(ip, ROOTINO, suj_find, 0);
> + if (sujino == 0)
> + errx(1, "Journal inode removed. Use tunefs to re-create.");
> + /*
> * Fetch the journal inode and verify it.
> */
> - jip = ino_read(fs->fs_sujournal);
> + jip = ino_read(sujino);
> printf("** SU+J Recovering %s\n", filesys);
> if (suj_verifyino(jip) != 0)
> return (-1);
> @@ -2469,11 +2537,11 @@ suj_check(const char *filesys)
> * available journal blocks in with suj_read().
> */
> printf("** Reading %jd byte journal from inode %d.\n",
> - DIP(jip, di_size), fs->fs_sujournal);
> + DIP(jip, di_size), sujino);
> suj_jblocks = jblocks_create();
> - blocks = ino_visit(jip, fs->fs_sujournal, suj_add_block, 0);
> + blocks = ino_visit(jip, sujino, suj_add_block, 0);
> if (blocks != numfrags(fs, DIP(jip, di_size)))
> - errx(1, "Sparse journal inode %d.\n", fs->fs_sujournal);
> + errx(1, "Sparse journal inode %d.\n", sujino);
> suj_read();
> jblocks_destroy(suj_jblocks);
> suj_jblocks = NULL;
>
> Modified: projects/suj/head/sbin/mount/mount.c
> ==============================================================================
> --- projects/suj/head/sbin/mount/mount.c Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/sbin/mount/mount.c Tue Jan 26 06:36:10 2010 (r203012)
> @@ -113,7 +113,6 @@ static struct opt {
> { MNT_ACLS, "acls" },
> { MNT_NFS4ACLS, "nfsv4acls" },
> { MNT_GJOURNAL, "gjournal" },
> - { MNT_SUJ, "journal" }, /* always soft-updates, journal */
> { 0, NULL }
> };
>
>
> Modified: projects/suj/head/sbin/tunefs/tunefs.c
> ==============================================================================
> --- projects/suj/head/sbin/tunefs/tunefs.c Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/sbin/tunefs/tunefs.c Tue Jan 26 06:36:10 2010 (r203012)
> @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
> #include <ufs/ufs/ufsmount.h>
> #include <ufs/ufs/dinode.h>
> #include <ufs/ffs/fs.h>
> +#include <ufs/ufs/dir.h>
>
> #include <ctype.h>
> #include <err.h>
> @@ -74,6 +75,7 @@ struct uufsd disk;
> void usage(void);
> void printfs(void);
> int journal_alloc(int64_t size);
> +void journal_clear(void);
> void sbdirty(void);
>
> int
> @@ -355,11 +357,11 @@ main(int argc, char *argv[])
> if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) {
> warnx("%s remains unchanged as disabled", name);
> } else {
> - sbdirty();
> + journal_clear();
> sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ);
> - sblock.fs_sujournal = 0;
> sblock.fs_sujfree = 0;
> - warnx("%s cleared", name);
> + warnx("%s cleared, "
> + "remove .sujournal to reclaim space", name);
> }
> }
> }
> @@ -523,11 +525,9 @@ journal_balloc(void)
> {
> ufs2_daddr_t blk;
> struct cg *cgp;
> - struct fs *fs;
> int valid;
>
> cgp = &disk.d_cg;
> - fs = &disk.d_fs;
> for (;;) {
> blk = cgballoc(&disk);
> if (blk > 0)
> @@ -553,13 +553,231 @@ journal_balloc(void)
> warnx("Failed to find sufficient free blocks for the journal");
> return -1;
> }
> - if (bwrite(&disk, fsbtodb(fs, blk), clrbuf, fs->fs_bsize) <= 0) {
> + if (bwrite(&disk, fsbtodb(&sblock, blk), clrbuf,
> + sblock.fs_bsize) <= 0) {
> warn("Failed to initialize new block");
> return -1;
> }
> return (blk);
> }
>
> +/*
> + * Search a directory block for the SUJ_FILE.
> + */
> +static ino_t
> +dir_search(ufs2_daddr_t blk, int bytes)
> +{
> + char block[MAXBSIZE];
> + struct direct *dp;
> + int off;
> +
> + if (bread(&disk, fsbtodb(&sblock, blk), block, bytes) <= 0) {
> + warn("Failed to read dir block");
> + return (-1);
> + }
> + for (off = 0; off < bytes; off += dp->d_reclen) {
> + dp = (struct direct *)&block[off];
> + if (dp->d_reclen == 0)
> + break;
> + if (dp->d_ino == 0)
> + continue;
> + if (dp->d_namlen != strlen(SUJ_FILE))
> + continue;
> + if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
> + continue;
> + return (dp->d_ino);
> + }
> +
> + return (0);
> +}
> +
> +/*
> + * Search in the ROOTINO for the SUJ_FILE. If it exists we can not enable
> + * journaling.
> + */
> +static ino_t
> +journal_findfile(void)
> +{
> + struct ufs1_dinode *dp1;
> + struct ufs2_dinode *dp2;
> + int mode;
> + void *ip;
> + int i;
> +
> + if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
> + warn("Failed to get root inode");
> + return (-1);
> + }
> + dp2 = ip;
> + dp1 = ip;
> + if (sblock.fs_magic == FS_UFS1_MAGIC) {
> + if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
> + warnx("ROOTINO extends beyond direct blocks.");
> + return (-1);
> + }
> + for (i = 0; i < NDADDR; i++) {
> + if (dp1->di_db[i] == 0)
> + break;
> + if (dir_search(dp1->di_db[i],
> + sblksize(&sblock, (off_t)dp1->di_size, i)) != 0)
> + return (-1);
> + }
> + } else {
> + if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
> + warnx("ROOTINO extends beyond direct blocks.");
> + return (-1);
> + }
> + for (i = 0; i < NDADDR; i++) {
> + if (dp2->di_db[i] == 0)
> + break;
> + if (dir_search(dp2->di_db[i],
> + sblksize(&sblock, (off_t)dp2->di_size, i)) != 0)
> + return (-1);
> + }
> + }
> +
> + return (0);
> +}
> +
> +/*
> + * Insert the journal at inode 'ino' into directory blk 'blk' at the first
> + * free offset of 'off'. DIRBLKSIZ blocks after off are initialized as
> + * empty.
> + */
> +static int
> +dir_insert(ufs2_daddr_t blk, off_t off, ino_t ino)
> +{
> + struct direct *dp;
> + char block[MAXBSIZE];
> +
> + if (bread(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
> + warn("Failed to read dir block");
> + return (-1);
> + }
> + bzero(&block[off], sblock.fs_bsize - off);
> + dp = (struct direct *)&block[off];
> + dp->d_ino = ino;
> + dp->d_reclen = DIRBLKSIZ;
> + dp->d_type = DT_REG;
> + dp->d_namlen = strlen(SUJ_FILE);
> + bcopy(SUJ_FILE, &dp->d_name, strlen(SUJ_FILE));
> + off += DIRBLKSIZ;
> + for (; off < sblock.fs_bsize; off += DIRBLKSIZ) {
> + dp = (struct direct *)&block[off];
> + dp->d_ino = 0;
> + dp->d_reclen = DIRBLKSIZ;
> + dp->d_type = DT_UNKNOWN;
> + }
> + if (bwrite(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
> + warn("Failed to write dir block");
> + return (-1);
> + }
> + return (0);
> +}
> +
> +/*
> + * Extend a directory block in 'blk' by copying it to a full size block
> + * and inserting the new journal inode into .sujournal.
> + */
> +static int
> +dir_extend(ufs2_daddr_t blk, ufs2_daddr_t nblk, off_t size, ino_t ino)
> +{
> + char block[MAXBSIZE];
> +
> + if (bread(&disk, fsbtodb(&sblock, blk), block, size) <= 0) {
> + warn("Failed to read dir block");
> + return (-1);
> + }
> + if (bwrite(&disk, fsbtodb(&sblock, nblk), block, size) <= 0) {
> + warn("Failed to write dir block");
> + return (-1);
> + }
> +
> + return dir_insert(nblk, size, ino);
> +}
> +
> +/*
> + * Insert the journal file into the ROOTINO directory. We always extend the
> + * last frag.
> + */
> +static int
> +journal_insertfile(ino_t ino)
> +{
> + struct ufs1_dinode *dp1;
> + struct ufs2_dinode *dp2;
> + void *ip;
> + ufs2_daddr_t nblk;
> + ufs2_daddr_t blk;
> + ufs_lbn_t lbn;
> + int size;
> + int mode;
> + int off;
> +
> + if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
> + warn("Failed to get root inode");
> + sbdirty();
> + return (-1);
> + }
> + dp2 = ip;
> + dp1 = ip;
> + blk = 0;
> + size = 0;
> + nblk = journal_balloc();
> + if (nblk <= 0)
> + return (-1);
> + /*
> + * For simplicity's sake we always extend the ROOTINO into a new
> + * directory block rather than searching for space and inserting
> + * into an existing block. However, if the rootino has frags we
> + * have to free them and extend the block.
> + */
> + if (sblock.fs_magic == FS_UFS1_MAGIC) {
> + lbn = lblkno(&sblock, dp1->di_size);
> + off = blkoff(&sblock, dp1->di_size);
> + blk = dp1->di_db[lbn];
> + size = sblksize(&sblock, (off_t)dp1->di_size, lbn);
> + } else {
> + lbn = lblkno(&sblock, dp2->di_size);
> + off = blkoff(&sblock, dp2->di_size);
> + blk = dp2->di_db[lbn];
> + size = sblksize(&sblock, (off_t)dp2->di_size, lbn);
> + }
> + if (off != 0) {
> + if (dir_extend(blk, nblk, off, ino) == -1)
> + return (-1);
> + } else {
> + blk = 0;
> + if (dir_insert(nblk, 0, ino) == -1)
> + return (-1);
> + }
> + if (sblock.fs_magic == FS_UFS1_MAGIC) {
> + dp1->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
> + dp1->di_db[lbn] = nblk;
> + dp1->di_size = lblktosize(&sblock, lbn+1);
> + } else {
> + dp2->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
> + dp2->di_db[lbn] = nblk;
> + dp2->di_size = lblktosize(&sblock, lbn+1);
> + }
> + if (putino(&disk) < 0) {
> + warn("Failed to write root inode");
> + return (-1);
> + }
> + if (cgwrite(&disk) < 0) {
> + warn("Failed to write updated cg");
> + sbdirty();
> + return (-1);
> + }
> + if (blk) {
> + if (cgbfree(&disk, blk, size) < 0) {
> + warn("Failed to write cg");
> + return (-1);
> + }
> + }
> +
> + return (0);
> +}
> +
> static int
> indir_fill(ufs2_daddr_t blk, int level, int *resid)
> {
> @@ -567,22 +785,20 @@ indir_fill(ufs2_daddr_t blk, int level,
> ufs1_daddr_t *bap1;
> ufs2_daddr_t *bap2;
> ufs2_daddr_t nblk;
> - struct fs *fs;
> int ncnt;
> int cnt;
> int i;
>
> - fs = &disk.d_fs;
> bzero(indirbuf, sizeof(indirbuf));
> bap1 = (ufs1_daddr_t *)indirbuf;
> bap2 = (void *)bap1;
> cnt = 0;
> - for (i = 0; i < NINDIR(fs) && *resid != 0; i++) {
> + for (i = 0; i < NINDIR(&sblock) && *resid != 0; i++) {
> nblk = journal_balloc();
> if (nblk <= 0)
> return (-1);
> cnt++;
> - if (fs->fs_magic == FS_UFS1_MAGIC)
> + if (sblock.fs_magic == FS_UFS1_MAGIC)
> *bap1++ = nblk;
> else
> *bap2++ = nblk;
> @@ -594,13 +810,47 @@ indir_fill(ufs2_daddr_t blk, int level,
> } else
> (*resid)--;
> }
> - if (bwrite(&disk, fsbtodb(fs, blk), indirbuf, fs->fs_bsize) <= 0) {
> + if (bwrite(&disk, fsbtodb(&sblock, blk), indirbuf,
> + sblock.fs_bsize) <= 0) {
> warn("Failed to write indirect");
> return (-1);
> }
> return (cnt);
> }
>
> +/*
> + * Clear the flag bits so the journal can be removed.
> + */
> +void
> +journal_clear(void)
> +{
> + struct ufs1_dinode *dp1;
> + struct ufs2_dinode *dp2;
> + ino_t ino;
> + int mode;
> + void *ip;
> +
> + ino = journal_findfile();
> + if (ino <= 0) {
> + warnx("Journal file does not exist");
> + return;
> + }
> + if (getino(&disk, &ip, ino, &mode) != 0) {
> + warn("Failed to get journal inode");
> + return;
> + }
> + dp2 = ip;
> + dp1 = ip;
> + if (sblock.fs_magic == FS_UFS1_MAGIC)
> + dp1->di_flags = 0;
> + else
> + dp2->di_flags = 0;
> + if (putino(&disk) < 0) {
> + warn("Failed to write journal inode");
> + return;
> + }
> +}
> +
> int
> journal_alloc(int64_t size)
> {
> @@ -609,32 +859,39 @@ journal_alloc(int64_t size)
> ufs2_daddr_t blk;
> void *ip;
> struct cg *cgp;
> - struct fs *fs;
> int resid;
> ino_t ino;
> int blks;
> int mode;
> int i;
>
> - fs = &disk.d_fs;
> cgp = &disk.d_cg;
> ino = 0;
>
> /*
> + * If the journal file exists we can't allocate it.
> + */
> + ino = journal_findfile();
> + if (ino > 0)
> + warnx("Journal file %s already exists, please remove.",
> + SUJ_FILE);
> + if (ino != 0)
> + return (-1);
> + /*
> * If the user didn't supply a size pick one based on the filesystem
> * size constrained with hardcoded MIN and MAX values. We opt for
> * 1/1024th of the filesystem up to MAX but not exceeding one CG and
> * not less than the MIN.
> */
> if (size == 0) {
> - size = (fs->fs_size * fs->fs_bsize) / 1024;
> + size = (sblock.fs_size * sblock.fs_bsize) / 1024;
> size = MIN(SUJ_MAX, size);
> - if (size / fs->fs_fsize > fs->fs_fpg)
> - size = fs->fs_fpg * fs->fs_fsize;
> + if (size / sblock.fs_fsize > sblock.fs_fpg)
> + size = sblock.fs_fpg * sblock.fs_fsize;
> size = MAX(SUJ_MIN, size);
> }
> - resid = blocks = size / fs->fs_bsize;
> - if (fs->fs_cstotal.cs_nbfree < blocks) {
> + resid = blocks = size / sblock.fs_bsize;
> + if (sblock.fs_cstotal.cs_nbfree < blocks) {
> warn("Insufficient free space for %jd byte journal", size);
> return (-1);
> }
> @@ -647,9 +904,9 @@ journal_alloc(int64_t size)
> continue;
> /*
> * Try to minimize fragmentation by requiring at least a
> - * 1/8th of the blocks be present in each cg we use.
> + * 1/16th of the blocks be present in each cg we use.
> */
> - if (cgp->cg_cs.cs_nbfree < blocks / 8)
> + if (cgp->cg_cs.cs_nbfree < blocks / 16)
> continue;
> ino = cgialloc(&disk);
> if (ino <= 0)
> @@ -668,22 +925,24 @@ journal_alloc(int64_t size)
> */
> dp2 = ip;
> dp1 = ip;
> - if (fs->fs_magic == FS_UFS1_MAGIC) {
> + if (sblock.fs_magic == FS_UFS1_MAGIC) {
> bzero(dp1, sizeof(*dp1));
> dp1->di_size = size;
> - dp1->di_mode = IFREG;
> + dp1->di_mode = IFREG | IREAD;
> dp1->di_nlink = 1;
> + dp1->di_flags = SF_IMMUTABLE | SF_NOUNLINK;
> } else {
> bzero(dp2, sizeof(*dp2));
> dp2->di_size = size;
> - dp2->di_mode = IFREG;
> + dp2->di_mode = IFREG | IREAD;
> dp2->di_nlink = 1;
> + dp2->di_flags = SF_IMMUTABLE | SF_NOUNLINK;
> }
> for (i = 0; i < NDADDR && resid; i++, resid--) {
> blk = journal_balloc();
> if (blk <= 0)
> goto out;
> - if (fs->fs_magic == FS_UFS1_MAGIC) {
> + if (sblock.fs_magic == FS_UFS1_MAGIC) {
> dp1->di_db[i] = blk;
> dp1->di_blocks++;
> } else {
> @@ -700,7 +959,7 @@ journal_alloc(int64_t size)
> sbdirty();
> goto out;
> }
> - if (fs->fs_magic == FS_UFS1_MAGIC) {
> + if (sblock.fs_magic == FS_UFS1_MAGIC) {
> dp1->di_ib[i] = blk;
> dp1->di_blocks += blks;
> } else {
> @@ -708,10 +967,10 @@ journal_alloc(int64_t size)
> dp2->di_blocks += blks;
> }
> }
> - if (fs->fs_magic == FS_UFS1_MAGIC)
> - dp1->di_blocks *= fs->fs_bsize / disk.d_bsize;
> + if (sblock.fs_magic == FS_UFS1_MAGIC)
> + dp1->di_blocks *= sblock.fs_bsize / disk.d_bsize;
> else
> - dp2->di_blocks *= fs->fs_bsize / disk.d_bsize;
> + dp2->di_blocks *= sblock.fs_bsize / disk.d_bsize;
> if (putino(&disk) < 0) {
> warn("Failed to write inode");
> sbdirty();
> @@ -722,8 +981,11 @@ journal_alloc(int64_t size)
> sbdirty();
> return (-1);
> }
> - fs->fs_sujournal = ino;
> - fs->fs_sujfree = 0;
> + if (journal_insertfile(ino) < 0) {
> + sbdirty();
> + return (-1);
> + }
> + sblock.fs_sujfree = 0;
> return (0);
> }
> warnx("Insufficient contiguous free space for the journal.");
>
> Modified: projects/suj/head/sys/sys/mount.h
> ==============================================================================
> --- projects/suj/head/sys/sys/mount.h Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/sys/sys/mount.h Tue Jan 26 06:36:10 2010 (r203012)
> @@ -240,7 +240,6 @@ void __mnt_vnode_markerfree(str
> #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */
> #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */
> #define MNT_NFS4ACLS 0x00000010
> -#define MNT_SUJ 0x00000080 /* softdep journaling */
>
> /*
> * NFS export related mount flags.
> @@ -277,7 +276,7 @@ void __mnt_vnode_markerfree(str
> MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \
> MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \
> MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \
> - MNT_NFS4ACLS | MNT_SUJ)
> + MNT_NFS4ACLS)
>
> /* Mask of flags that can be updated. */
> #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \
> @@ -326,6 +325,7 @@ void __mnt_vnode_markerfree(str
> #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */
> #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */
> #define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */
> +#define MNTK_SUJ 0x00000100 /* Softdep journaling enabled */
> #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */
> #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */
> #define MNTK_SUSPEND 0x08000000 /* request write suspension */
>
> Modified: projects/suj/head/sys/ufs/ffs/ffs_alloc.c
> ==============================================================================
> --- projects/suj/head/sys/ufs/ffs/ffs_alloc.c Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/sys/ufs/ffs/ffs_alloc.c Tue Jan 26 06:36:10 2010 (r203012)
> @@ -1851,6 +1851,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, i
> ino_t inum;
> struct workhead *dephd;
> {
> + struct mount *mp;
> struct cg *cgp;
> struct buf *bp;
> ufs1_daddr_t fragno, cgbno;
> @@ -1965,7 +1966,8 @@ ffs_blkfree(ump, fs, devvp, bno, size, i
> fs->fs_fmod = 1;
> ACTIVECLEAR(fs, cg);
> UFS_UNLOCK(ump);
> - if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP)
> + mp = UFSTOVFS(ump);
> + if (mp->mnt_flag & MNT_SOFTDEP)
> softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
> numfrags(fs, size), dephd);
> bdwrite(bp);
>
> Modified: projects/suj/head/sys/ufs/ffs/ffs_softdep.c
> ==============================================================================
> --- projects/suj/head/sys/ufs/ffs/ffs_softdep.c Tue Jan 26 05:17:03 2010 (r203011)
> +++ projects/suj/head/sys/ufs/ffs/ffs_softdep.c Tue Jan 26 06:36:10 2010 (r203012)
> @@ -1902,7 +1902,7 @@ softdep_unmount(mp)
> struct mount *mp;
> {
>
> - if (mp->mnt_flag & MNT_SUJ)
> + if (mp->mnt_kern_flag & MNTK_SUJ)
> journal_unmount(mp);
> }
>
> @@ -2044,16 +2044,36 @@ journal_mount(mp, fs, cred)
> struct fs *fs;
> struct ucred *cred;
> {
> + struct componentname cnp;
> struct jblocks *jblocks;
> + struct vnode *dvp;
> struct vnode *vp;
> struct inode *ip;
> ufs2_daddr_t blkno;
> + ino_t sujournal;
> int bcount;
> int error;
> int i;
>
> - mp->mnt_flag |= MNT_SUJ;
> - error = VFS_VGET(mp, fs->fs_sujournal, LK_EXCLUSIVE, &vp);
> + mp->mnt_kern_flag |= MNTK_SUJ;
> + error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
> + if (error)
> + return (error);
> + bzero(&cnp, sizeof(cnp));
> + cnp.cn_nameiop = LOOKUP;
> + cnp.cn_flags = ISLASTCN;
> + cnp.cn_thread = curthread;
> + cnp.cn_cred = curthread->td_ucred;
> + cnp.cn_pnbuf = SUJ_FILE;
> + cnp.cn_nameptr = SUJ_FILE;
> + cnp.cn_namelen = strlen(SUJ_FILE);
> + error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
> + vput(dvp);
> + if (error != 0) {
> + printf("Failed to find journal. Use tunefs to create one\n");
> + return (error);
> + }
> + error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, &vp);
> if (error)
> return (error);
> ip = VTOI(vp);
> @@ -2075,9 +2095,18 @@ journal_mount(mp, fs, cred)
> }
> jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
> jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
> - DIP_SET(ip, i_modrev, fs->fs_mtime);
> - ip->i_flags |= IN_MODIFIED;
> - ffs_update(vp, 1);
> + /*
> + * Only validate the journal contents if the filesystem is clean,
> + * otherwise we write the logs but they'll never be used. If the
> + * filesystem was still dirty when we mounted it the journal is
> + * invalid and a new journal can only be valid if it starts from a
> + * clean mount.
> + */
> + if (fs->fs_clean) {
> + DIP_SET(ip, i_modrev, fs->fs_mtime);
> + ip->i_flags |= IN_MODIFIED;
> + ffs_update(vp, 1);
> + }
> VFSTOUFS(mp)->softdep_jblocks = jblocks;
> out:
> vput(vp);
> @@ -2159,6 +2188,11 @@ remove_from_journal(wk)
> ump->softdep_on_journal -= 1;
> }
>
> +/*
> + * Check for journal space as well as dependency limits so the prelink
> + * code can throttle both journaled and non-journaled filesystems.
> + * Threshold is 0 for low and 1 for min.
> + */
> static int
> journal_space(ump, thresh)
> struct ufsmount *ump;
> @@ -2167,7 +2201,20 @@ journal_space(ump, thresh)
> struct jblocks *jblocks;
> int avail;
>
> + /*
> + * We use a tighter restriction here to prevent request_cleanup()
> + * running in threads from running into locks we currently hold.
> + */
> + if (num_inodedep > (max_softdeps / 10) * 9)
> + return (0);
> +
> jblocks = ump->softdep_jblocks;
> + if (jblocks == NULL)
> + return (1);
> + if (thresh)
> + thresh = jblocks->jb_min;
> + else
> + thresh = jblocks->jb_low;
> avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
> avail = jblocks->jb_free - avail;
>
> @@ -2210,15 +2257,13 @@ softdep_prealloc(vp, waitok)
> struct vnode *vp;
> int waitok;
> {
> - struct jblocks *jblocks;
> struct ufsmount *ump;
>
> if (DOINGSUJ(vp) == 0)
> return (0);
> ump = VFSTOUFS(vp->v_mount);
> - jblocks = ump->softdep_jblocks;
> ACQUIRE_LOCK(&lk);
> - if (journal_space(ump, jblocks->jb_low)) {
> + if (journal_space(ump, 0)) {
> FREE_LOCK(&lk);
> return (0);
> }
> @@ -2233,9 +2278,9 @@ softdep_prealloc(vp, waitok)
> ffs_syncvnode(vp, waitok);
> ACQUIRE_LOCK(&lk);
> process_removes(vp);
> - if (journal_space(ump, jblocks->jb_low) == 0) {
> + if (journal_space(ump, 0) == 0) {
> softdep_speedup();
> - if (journal_space(ump, jblocks->jb_min) == 0)
> + if (journal_space(ump, 1) == 0)
> journal_suspend(ump);
> }
> FREE_LOCK(&lk);
> @@ -2243,18 +2288,22 @@ softdep_prealloc(vp, waitok)
> return (0);
>
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
>
More information about the svn-src-projects
mailing list