svn commit: r304239 - head/sys/ufs/ffs
Kirk McKusick
mckusick at FreeBSD.org
Tue Aug 16 21:02:31 UTC 2016
Author: mckusick
Date: Tue Aug 16 21:02:30 2016
New Revision: 304239
URL: https://svnweb.freebsd.org/changeset/base/304239
Log:
Bug 211013 reports that a write error to a UFS filesystem running
with softupdates panics the kernel. The problem that has been pointed
out is that when there is a transient write error on certain metadata
blocks, specifically directory blocks (PAGEDEP), inode blocks
(INODEDEP), indirect pointer blocks (INDIRDEPS), and cylinder group
(BMSAFEMAP, but only when journaling is enabled), we get a panic
in one of the routines called by softdep_disk_io_initiation that
the I/O is "already started" when we retry the write.
These dependency types potentially need to do roll-backs when called
by softdep_disk_io_initiation before doing a write and then a
roll-forward when called by softdep_disk_write_complete after the
I/O completes. The panic happens when there is a transient error.
At the top of softdep_disk_write_complete we check to see if the
write had an error and if an error occurred we just return. This
return is correct most of the time because the main role of the routines
called by softdep_disk_write_complete is to process the now-completed
dependencies so that the next I/O steps can happen.
But for the four types listed above, they do not get to do their
rollback operations. This causes the panic when softdep_disk_io_initiation
gets called on the second attempt to do the write and the roll-back
routines find that the roll-backs have already been done. As an
aside I note that there is also the problem that the buffer will
have been unlocked and thus made visible to the filesystem and to
user applications with the roll-backs in place.
The way to resolve the problem is to add a flag to the routines called
by softdep_disk_write_complete for the four dependency types noted
that indicates whether the write was successful (WRITESUCCEEDED).
If the write does not succeed, they do just the roll-backs and then
return. If the write was successful they also do their usual
processing of the now-completed dependencies.
The fix was tested by selectively injecting write errors for buffers
holding dependencies of each of the four types noted above and then
verifying that the kernel no longer paniced and that following the
successful retry of the write that the filesystem could be unmounted
and successfully checked cleanly.
PR: 211013
Reviewed by: kib
Modified:
head/sys/ufs/ffs/ffs_softdep.c
head/sys/ufs/ffs/softdep.h
Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c Tue Aug 16 20:35:36 2016 (r304238)
+++ head/sys/ufs/ffs/ffs_softdep.c Tue Aug 16 21:02:30 2016 (r304239)
@@ -752,16 +752,16 @@ static int flush_newblk_dep(struct vnode
static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
static int flush_deplist(struct allocdirectlst *, int, int *);
static int sync_cgs(struct mount *, int);
-static int handle_written_filepage(struct pagedep *, struct buf *);
+static int handle_written_filepage(struct pagedep *, struct buf *, int);
static int handle_written_sbdep(struct sbdep *, struct buf *);
static void initiate_write_sbdep(struct sbdep *);
static void diradd_inode_written(struct diradd *, struct inodedep *);
static int handle_written_indirdep(struct indirdep *, struct buf *,
- struct buf**);
-static int handle_written_inodeblock(struct inodedep *, struct buf *);
+ struct buf**, int);
+static int handle_written_inodeblock(struct inodedep *, struct buf *, int);
static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
uint8_t *);
-static int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
+static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
static void handle_written_jaddref(struct jaddref *);
static void handle_written_jremref(struct jremref *);
static void handle_written_jseg(struct jseg *, struct buf *);
@@ -10874,6 +10874,10 @@ initiate_write_bmsafemap(bmsafemap, bp)
struct fs *fs;
ino_t ino;
+ /*
+ * If this is a background write, we did this at the time that
+ * the copy was made, so do not need to do it again.
+ */
if (bmsafemap->sm_state & IOSTARTED)
return;
bmsafemap->sm_state |= IOSTARTED;
@@ -10947,10 +10951,39 @@ softdep_disk_write_complete(bp)
/*
* If an error occurred while doing the write, then the data
- * has not hit the disk and the dependencies cannot be unrolled.
+ * has not hit the disk and the dependencies cannot be processed.
+ * But we do have to go through and roll forward any dependencies
+ * that were rolled back before the disk write.
*/
- if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
+ if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
+ LIST_FOREACH(wk, &bp->b_dep, wk_list) {
+ switch (wk->wk_type) {
+
+ case D_PAGEDEP:
+ handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
+ continue;
+
+ case D_INODEDEP:
+ handle_written_inodeblock(WK_INODEDEP(wk),
+ bp, 0);
+ continue;
+
+ case D_BMSAFEMAP:
+ handle_written_bmsafemap(WK_BMSAFEMAP(wk),
+ bp, 0);
+ continue;
+
+ case D_INDIRDEP:
+ handle_written_indirdep(WK_INDIRDEP(wk),
+ bp, &sbp, 0);
+ continue;
+ default:
+ /* nothing to roll forward */
+ continue;
+ }
+ }
return;
+ }
if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
return;
ump = VFSTOUFS(wk->wk_mp);
@@ -10970,17 +11003,20 @@ softdep_disk_write_complete(bp)
switch (wk->wk_type) {
case D_PAGEDEP:
- if (handle_written_filepage(WK_PAGEDEP(wk), bp))
+ if (handle_written_filepage(WK_PAGEDEP(wk), bp,
+ WRITESUCCEEDED))
WORKLIST_INSERT(&reattach, wk);
continue;
case D_INODEDEP:
- if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
+ if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
+ WRITESUCCEEDED))
WORKLIST_INSERT(&reattach, wk);
continue;
case D_BMSAFEMAP:
- if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
+ if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
+ WRITESUCCEEDED))
WORKLIST_INSERT(&reattach, wk);
continue;
@@ -10999,7 +11035,8 @@ softdep_disk_write_complete(bp)
continue;
case D_INDIRDEP:
- if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
+ if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
+ WRITESUCCEEDED))
WORKLIST_INSERT(&reattach, wk);
continue;
@@ -11299,12 +11336,17 @@ handle_bufwait(inodedep, refhd)
* Called from within softdep_disk_write_complete above to restore
* in-memory inode block contents to their most up-to-date state. Note
* that this routine is always called from interrupt level with further
- * splbio interrupts blocked.
+ * interrupts from this device blocked.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
*/
static int
-handle_written_inodeblock(inodedep, bp)
+handle_written_inodeblock(inodedep, bp, flags)
struct inodedep *inodedep;
struct buf *bp; /* buffer containing the inode block */
+ int flags;
{
struct freefile *freefile;
struct allocdirect *adp, *nextadp;
@@ -11334,7 +11376,8 @@ handle_written_inodeblock(inodedep, bp)
/*
* Leave this inodeblock dirty until it's in the list.
*/
- if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
+ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
+ (flags & WRITESUCCEEDED)) {
struct inodedep *inon;
inon = TAILQ_NEXT(inodedep, id_unlinked);
@@ -11373,7 +11416,8 @@ handle_written_inodeblock(inodedep, bp)
goto bufwait;
return (1);
}
- inodedep->id_state |= COMPLETE;
+ if (flags & WRITESUCCEEDED)
+ inodedep->id_state |= COMPLETE;
/*
* Roll forward anything that had to be rolled back before
* the inode could be updated.
@@ -11488,6 +11532,13 @@ handle_written_inodeblock(inodedep, bp)
bdirty(bp);
bufwait:
/*
+ * If the write did not succeed, we have done all the roll-forward
+ * operations, but we cannot take the actions that will allow its
+ * dependencies to be processed.
+ */
+ if ((flags & WRITESUCCEEDED) == 0)
+ return (hadchanges);
+ /*
* Process any allocdirects that completed during the update.
*/
if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
@@ -11544,11 +11595,20 @@ bufwait:
return (hadchanges);
}
+/*
+ * Perform needed roll-forwards and kick off any dependencies that
+ * can now be processed.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
+ */
static int
-handle_written_indirdep(indirdep, bp, bpp)
+handle_written_indirdep(indirdep, bp, bpp, flags)
struct indirdep *indirdep;
struct buf *bp;
struct buf **bpp;
+ int flags;
{
struct allocindir *aip;
struct buf *sbp;
@@ -11573,6 +11633,16 @@ handle_written_indirdep(indirdep, bp, bp
indirdep->ir_state &= ~(UNDONE | IOSTARTED);
indirdep->ir_state |= ATTACHED;
/*
+ * If the write did not succeed, we have done all the roll-forward
+ * operations, but we cannot take the actions that will allow its
+ * dependencies to be processed.
+ */
+ if ((flags & WRITESUCCEEDED) == 0) {
+ stat_indir_blk_ptrs++;
+ bdirty(bp);
+ return (1);
+ }
+ /*
* Move allocindirs with written pointers to the completehd if
* the indirdep's pointer is not yet written. Otherwise
* free them here.
@@ -11726,11 +11796,16 @@ jnewblk_rollforward(jnewblk, fs, cgp, bl
* Complete a write to a bmsafemap structure. Roll forward any bitmap
* changes if it's not a background write. Set all written dependencies
* to DEPCOMPLETE and free the structure if possible.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
*/
static int
-handle_written_bmsafemap(bmsafemap, bp)
+handle_written_bmsafemap(bmsafemap, bp, flags)
struct bmsafemap *bmsafemap;
struct buf *bp;
+ int flags;
{
struct newblk *newblk;
struct inodedep *inodedep;
@@ -11746,15 +11821,20 @@ handle_written_bmsafemap(bmsafemap, bp)
int chgs;
if ((bmsafemap->sm_state & IOSTARTED) == 0)
- panic("initiate_write_bmsafemap: Not started\n");
+ panic("handle_written_bmsafemap: Not started\n");
ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
chgs = 0;
bmsafemap->sm_state &= ~IOSTARTED;
foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
/*
- * Release journal work that was waiting on the write.
+ * If write was successful, release journal work that was waiting
+ * on the write. Otherwise move the work back.
*/
- handle_jwork(&bmsafemap->sm_freewr);
+ if (flags & WRITESUCCEEDED)
+ handle_jwork(&bmsafemap->sm_freewr);
+ else
+ LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
+ worklist, wk_list);
/*
* Restore unwritten inode allocation pending jaddref writes.
@@ -11804,6 +11884,20 @@ handle_written_bmsafemap(bmsafemap, bp)
free_jnewblk(jnewblk);
}
}
+ /*
+ * If the write did not succeed, we have done all the roll-forward
+ * operations, but we cannot take the actions that will allow its
+ * dependencies to be processed.
+ */
+ if ((flags & WRITESUCCEEDED) == 0) {
+ LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
+ newblk, nb_deps);
+ LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
+ worklist, wk_list);
+ if (foreground)
+ bdirty(bp);
+ return (1);
+ }
while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
newblk->nb_state |= DEPCOMPLETE;
newblk->nb_state &= ~ONDEPLIST;
@@ -11907,12 +12001,17 @@ free_pagedep(pagedep)
* A write operation was just completed. Removed inodes can
* now be freed and associated block pointers may be committed.
* Note that this routine is always called from interrupt level
- * with further splbio interrupts blocked.
+ * with further interrupts from this device blocked.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
*/
static int
-handle_written_filepage(pagedep, bp)
+handle_written_filepage(pagedep, bp, flags)
struct pagedep *pagedep;
struct buf *bp; /* buffer containing the written page */
+ int flags;
{
struct dirrem *dirrem;
struct diradd *dap, *nextdap;
@@ -11922,6 +12021,8 @@ handle_written_filepage(pagedep, bp)
if ((pagedep->pd_state & IOSTARTED) == 0)
panic("handle_written_filepage: not started");
pagedep->pd_state &= ~IOSTARTED;
+ if ((flags & WRITESUCCEEDED) == 0)
+ goto rollforward;
/*
* Process any directory removals that have been committed.
*/
@@ -11941,6 +12042,7 @@ handle_written_filepage(pagedep, bp)
if ((pagedep->pd_state & NEWBLOCK) == 0)
while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
free_diradd(dap, NULL);
+rollforward:
/*
* Uncommitted directory entries must be restored.
*/
@@ -11973,7 +12075,7 @@ handle_written_filepage(pagedep, bp)
* marked dirty so that its will eventually get written back in
* its correct form.
*/
- if (chgs) {
+ if (chgs || (flags & WRITESUCCEEDED) == 0) {
if ((bp->b_flags & B_DELWRI) == 0)
stat_dir_entry++;
bdirty(bp);
Modified: head/sys/ufs/ffs/softdep.h
==============================================================================
--- head/sys/ufs/ffs/softdep.h Tue Aug 16 20:35:36 2016 (r304238)
+++ head/sys/ufs/ffs/softdep.h Tue Aug 16 21:02:30 2016 (r304239)
@@ -140,6 +140,7 @@
#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */
#define UNLINKONLIST 0x200000 /* inodedep is in the unlinked list on disk */
#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV)
+#define WRITESUCCEEDED 0x400000 /* the disk write completed successfully */
#define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE)
More information about the svn-src-head
mailing list