svn commit: r348931 - in projects/fuse2: sys/fs/fuse tests/sys/fs/fusefs
Alan Somers
asomers at FreeBSD.org
Tue Jun 11 16:32:35 UTC 2019
Author: asomers
Date: Tue Jun 11 16:32:33 2019
New Revision: 348931
URL: https://svnweb.freebsd.org/changeset/base/348931
Log:
fusefs: WIP fixing writeback cacheing
The current "writeback" cache mode, selected by the
vfs.fusefs.data_cache_mode sysctl, doesn't do writeback cacheing at all. It
merely goes through the motions of using buf(9), but then writes every
buffer synchronously. This commit:
* Enables delayed writes when the sysctl is set to writeback cacheing
* Fixes a cache-coherency problem when extending a file whose last page has
just been written.
* Removes the "sync" mount option, which had been set unconditionally.
* Adjusts some SDT probes
* Adds several new tests that mimic what fsx does but with more control and
without a real file system. As I discover failures with fsx, I add
regression tests to this file.
* Adds a test that ensures we can append to a file without reading any data
from it.
This change is still incomplete. Clustered writing is not yet supported,
and there are frequent "panic: vm_fault_hold: fault on nofault entry" panics
that I need to fix.
Sponsored by: The FreeBSD Foundation
Added:
projects/fuse2/tests/sys/fs/fusefs/io.cc (contents, props changed)
Modified:
projects/fuse2/sys/fs/fuse/fuse_io.c
projects/fuse2/sys/fs/fuse/fuse_vfsops.c
projects/fuse2/tests/sys/fs/fusefs/Makefile
projects/fuse2/tests/sys/fs/fusefs/write.cc
Modified: projects/fuse2/sys/fs/fuse/fuse_io.c
==============================================================================
--- projects/fuse2/sys/fs/fuse/fuse_io.c Tue Jun 11 16:16:16 2019 (r348930)
+++ projects/fuse2/sys/fs/fuse/fuse_io.c Tue Jun 11 16:32:33 2019 (r348931)
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -265,9 +266,10 @@ out:
return (err);
}
-SDT_PROBE_DEFINE3(fusefs, , io, read_bio_backend_start, "int", "int", "int");
+SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int");
SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "int");
-SDT_PROBE_DEFINE3(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int");
+SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int",
+ "struct buf*");
static int
fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid)
@@ -297,9 +299,6 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
- SDT_PROBE3(fusefs, , io, read_bio_backend_start,
- biosize, (int)lbn, on);
-
if ((off_t)lbn * biosize >= filesize) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > filesize) {
@@ -308,6 +307,9 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio
bcount = biosize;
}
+ SDT_PROBE4(fusefs, , io, read_bio_backend_start,
+ biosize, (int)lbn, on, bcount);
+
/* TODO: readahead. See ext2_read for an example */
err = bread(vp, lbn, bcount, NOCRED, &bp);
if (err) {
@@ -333,8 +335,8 @@ fuse_read_biobackend(struct vnode *vp, struct uio *uio
err = uiomove(bp->b_data + on, n, uio);
}
vfs_bio_brelse(bp, ioflag);
- SDT_PROBE3(fusefs, , io, read_bio_backend_end, err,
- uio->uio_resid, n);
+ SDT_PROBE4(fusefs, , io, read_bio_backend_end, err,
+ uio->uio_resid, n, bp);
}
return (err);
@@ -564,6 +566,7 @@ fuse_write_biobackend(struct vnode *vp, struct uio *ui
off_t filesize;
int bcount;
int n, on, err = 0;
+ bool last_page;
const int biosize = fuse_iosize(vp);
@@ -612,6 +615,11 @@ again:
extending = true;
bcount = on + n;
}
+ if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >=
+ howmany(filesize, PAGE_SIZE))
+ last_page = true;
+ else
+ last_page = false;
if (direct_append) {
/*
* Take care to preserve the buffer's B_CACHE state so
@@ -642,6 +650,8 @@ again:
break;
}
}
+ if (biosize > bcount)
+ vfs_bio_clrbuf(bp);
SDT_PROBE6(fusefs, , io, write_biobackend_start,
lbn, on, n, uio, bcount, direct_append);
@@ -689,7 +699,6 @@ again:
* If the chopping creates a reverse-indexed or degenerate
* situation with dirtyoff/end, we 0 both of them.
*/
-
if (bp->b_dirtyend > bcount) {
SDT_PROBE2(fusefs, , io, write_biobackend_append_race,
(long)bp->b_blkno * biosize,
@@ -738,6 +747,7 @@ again:
bp->b_error = err;
brelse(bp);
break;
+ /* TODO: vfs_bio_clrbuf like ffs_write does? */
}
/*
* Only update dirtyoff/dirtyend if not a degenerate
@@ -753,7 +763,42 @@ again:
}
vfs_bio_set_valid(bp, on, n);
}
- err = bwrite(bp);
+
+ vfs_bio_set_flags(bp, ioflag);
+
+ if (last_page) {
+ /*
+ * When writing the last page of a file we must write
+ * synchronously. If we didn't, then a subsequent
+ * operation could extend the file, making the last
+ * page of this buffer invalid because it would only be
+ * partially cached.
+ *
+ * As an optimization, it would be allowable to only
+ * write the last page synchronously. Or, it should be
+ * possible to synchronously flush the last
+ * already-written page whenever extending a file with
+ * ftruncate or another write.
+ */
+ err = bwrite(bp);
+ } else if (ioflag & IO_SYNC) {
+ err = bwrite(bp);
+ } else if (vm_page_count_severe() ||
+ buf_dirty_count_severe() ||
+ (ioflag & IO_ASYNC)) {
+ /* TODO: enable write clustering later */
+ bawrite(bp);
+ } else if (on == 0 && n == bcount) {
+ if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
+ bdwrite(bp);
+ else
+ bawrite(bp);
+ } else if (ioflag & IO_DIRECT) {
+ bawrite(bp);
+ } else {
+ bp->b_flags &= ~B_CLUSTEROK;
+ bdwrite(bp);
+ }
if (err)
break;
} while (uio->uio_resid > 0 && n > 0);
@@ -819,7 +864,7 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp)
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
- uiop->uio_offset = ((off_t)bp->b_blkno) * biosize;
+ uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize;
error = fuse_read_directbackend(vp, uiop, cred, fufh);
if (!error && uiop->uio_resid) {
@@ -854,14 +899,14 @@ fuse_io_strategy(struct vnode *vp, struct buf *bp)
return (error);
}
- if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > filesize)
+ if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize)
bp->b_dirtyend = filesize -
- (off_t)bp->b_blkno * biosize;
+ (off_t)bp->b_lblkno * biosize;
if (bp->b_dirtyend > bp->b_dirtyoff) {
io.iov_len = uiop->uio_resid = bp->b_dirtyend
- bp->b_dirtyoff;
- uiop->uio_offset = (off_t)bp->b_blkno * biosize
+ uiop->uio_offset = (off_t)bp->b_lblkno * biosize
+ bp->b_dirtyoff;
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
Modified: projects/fuse2/sys/fs/fuse/fuse_vfsops.c
==============================================================================
--- projects/fuse2/sys/fs/fuse/fuse_vfsops.c Tue Jun 11 16:16:16 2019 (r348930)
+++ projects/fuse2/sys/fs/fuse/fuse_vfsops.c Tue Jun 11 16:32:33 2019 (r348931)
@@ -314,9 +314,6 @@ fuse_vfsop_mount(struct mount *mp)
__mntopts = 0;
td = curthread;
- MNT_ILOCK(mp);
- mp->mnt_flag |= MNT_SYNCHRONOUS;
- MNT_IUNLOCK(mp);
/* Get the new options passed to mount */
opts = mp->mnt_optnew;
Modified: projects/fuse2/tests/sys/fs/fusefs/Makefile
==============================================================================
--- projects/fuse2/tests/sys/fs/fusefs/Makefile Tue Jun 11 16:16:16 2019 (r348930)
+++ projects/fuse2/tests/sys/fs/fusefs/Makefile Tue Jun 11 16:32:33 2019 (r348931)
@@ -21,6 +21,7 @@ GTESTS+= fsync
GTESTS+= fsyncdir
GTESTS+= getattr
GTESTS+= interrupt
+GTESTS+= io
GTESTS+= link
GTESTS+= locks
GTESTS+= lookup
Added: projects/fuse2/tests/sys/fs/fusefs/io.cc
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/fuse2/tests/sys/fs/fusefs/io.cc Tue Jun 11 16:32:33 2019 (r348931)
@@ -0,0 +1,276 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ *
+ * This software was developed by BFF Storage Systems, LLC under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+extern "C" {
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+}
+
+#include "mockfs.hh"
+#include "utils.hh"
+
+/*
+ * For testing I/O like fsx does, but deterministically and without a real
+ * underlying file system
+ *
+ * TODO: after fusefs gains the options to select cache mode for each mount
+ * point, run each of these tests for all cache modes.
+ */
+
+using namespace testing;
+
+const char FULLPATH[] = "mountpoint/some_file.txt";
+const char RELPATH[] = "some_file.txt";
+const uint64_t ino = 42;
+
+class Io: public FuseTest {
+public:
+int m_backing_fd, m_control_fd, m_test_fd;
+
+Io(): m_backing_fd(-1), m_control_fd(-1) {};
+
+void SetUp()
+{
+ m_backing_fd = open("backing_file", O_RDWR | O_CREAT | O_TRUNC);
+ if (m_backing_fd < 0)
+ FAIL() << strerror(errno);
+ m_control_fd = open("control", O_RDWR | O_CREAT | O_TRUNC);
+ if (m_control_fd < 0)
+ FAIL() << strerror(errno);
+ srandom(22'9'1982); // Seed with my birthday
+ FuseTest::SetUp();
+ if (IsSkipped())
+ return;
+
+ expect_lookup(RELPATH, ino, S_IFREG | 0644, 0, 1);
+ expect_open(ino, 0, 1);
+ EXPECT_CALL(*m_mock, process(
+ ResultOf([=](auto in) {
+ return (in.header.opcode == FUSE_WRITE &&
+ in.header.nodeid == ino);
+ }, Eq(true)),
+ _)
+ ).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) {
+ const char *buf = (const char*)in.body.bytes +
+ sizeof(struct fuse_write_in);
+ ssize_t isize = in.body.write.size;
+ off_t iofs = in.body.write.offset;
+
+ ASSERT_EQ(isize, pwrite(m_backing_fd, buf, isize, iofs))
+ << strerror(errno);
+ SET_OUT_HEADER_LEN(out, write);
+ out.body.write.size = isize;
+ })));
+ EXPECT_CALL(*m_mock, process(
+ ResultOf([=](auto in) {
+ return (in.header.opcode == FUSE_READ &&
+ in.header.nodeid == ino);
+ }, Eq(true)),
+ _)
+ ).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) {
+ ssize_t isize = in.body.write.size;
+ off_t iofs = in.body.write.offset;
+ void *buf = out.body.bytes;
+
+ ASSERT_LE(0, pread(m_backing_fd, buf, isize, iofs))
+ << strerror(errno);
+ out.header.len = sizeof(struct fuse_out_header) + isize;
+ })));
+ EXPECT_CALL(*m_mock, process(
+ ResultOf([=](auto in) {
+ uint32_t valid = FATTR_SIZE | FATTR_FH;
+ return (in.header.opcode == FUSE_SETATTR &&
+ in.header.nodeid == ino &&
+ in.body.setattr.valid == valid);
+ }, Eq(true)),
+ _)
+ ).WillRepeatedly(Invoke(ReturnImmediate([=](auto in, auto& out) {
+ ASSERT_EQ(0, ftruncate(m_backing_fd, in.body.setattr.size))
+ << strerror(errno);
+ SET_OUT_HEADER_LEN(out, attr);
+ out.body.attr.attr.ino = ino;
+ out.body.attr.attr.mode = S_IFREG | 0755;
+ out.body.attr.attr.size = in.body.setattr.size;
+ out.body.attr.attr_valid = UINT64_MAX;
+ })));
+ /* Any test that close()s will send FUSE_FLUSH and FUSE_RELEASE */
+ EXPECT_CALL(*m_mock, process(
+ ResultOf([=](auto in) {
+ return (in.header.opcode == FUSE_FLUSH &&
+ in.header.nodeid == ino);
+ }, Eq(true)),
+ _)
+ ).WillRepeatedly(Invoke(ReturnErrno(0)));
+ EXPECT_CALL(*m_mock, process(
+ ResultOf([=](auto in) {
+ return (in.header.opcode == FUSE_RELEASE &&
+ in.header.nodeid == ino);
+ }, Eq(true)),
+ _)
+ ).WillRepeatedly(Invoke(ReturnErrno(0)));
+
+ m_test_fd = open(FULLPATH, O_RDWR );
+ EXPECT_LE(0, m_test_fd) << strerror(errno);
+}
+
+void TearDown()
+{
+ if (m_backing_fd >= 0)
+ close(m_backing_fd);
+ if (m_control_fd >= 0)
+ close(m_control_fd);
+ FuseTest::TearDown();
+ /* Deliberately leak test_fd */
+}
+
+void do_ftruncate(off_t offs)
+{
+ ASSERT_EQ(0, ftruncate(m_test_fd, offs)) << strerror(errno);
+ ASSERT_EQ(0, ftruncate(m_control_fd, offs)) << strerror(errno);
+}
+
+void do_read(ssize_t size, off_t offs)
+{
+ void *test_buf, *control_buf;
+
+ test_buf = malloc(size);
+ ASSERT_NE(NULL, test_buf) << strerror(errno);
+ control_buf = malloc(size);
+ ASSERT_NE(NULL, control_buf) << strerror(errno);
+
+ ASSERT_EQ(size, pread(m_test_fd, test_buf, size, offs))
+ << strerror(errno);
+ ASSERT_EQ(size, pread(m_control_fd, control_buf, size, offs))
+ << strerror(errno);
+
+ ASSERT_EQ(0, memcmp(test_buf, control_buf, size));
+
+ free(control_buf);
+ free(test_buf);
+}
+
+void do_write(ssize_t size, off_t offs)
+{
+ char *buf;
+ long i;
+
+ buf = (char*)malloc(size);
+ ASSERT_NE(NULL, buf) << strerror(errno);
+ for (i=0; i < size; i++)
+ buf[i] = random();
+
+ ASSERT_EQ(size, pwrite(m_test_fd, buf, size, offs ))
+ << strerror(errno);
+ ASSERT_EQ(size, pwrite(m_control_fd, buf, size, offs))
+ << strerror(errno);
+}
+
+};
+
+/*
+ * Extend a file with dirty data in the last page of the last block.
+ *
+ * fsx -WR -P /tmp -S8 -N3 fsx.bin
+ */
+TEST_F(Io, extend_from_dirty_page)
+{
+ off_t wofs = 0x21a0;
+ ssize_t wsize = 0xf0a8;
+ off_t rofs = 0xb284;
+ ssize_t rsize = 0x9b22;
+ off_t truncsize = 0x28702;
+
+ do_write(wsize, wofs);
+ do_ftruncate(truncsize);
+ do_read(rsize, rofs);
+}
+
+/*
+ * When writing the last page of a file, it must be written synchronously.
+ * Otherwise the cached page can become invalid by a subsequent extend
+ * operation.
+ *
+ * fsx -WR -P /tmp -S642 -N3 fsx.bin
+ */
+TEST_F(Io, last_page)
+{
+ off_t wofs0 = 0x1134f;
+ ssize_t wsize0 = 0xcc77;
+ off_t wofs1 = 0x2096a;
+ ssize_t wsize1 = 0xdfa7;
+ off_t rofs = 0x1a3aa;
+ ssize_t rsize = 0xb5b7;
+
+ do_write(wsize0, wofs0);
+ do_write(wsize1, wofs1);
+ do_read(rsize, rofs);
+}
+
+/*
+ * Read a hole from a block that contains some cached data.
+ *
+ * fsx -WR -P /tmp -S55 fsx.bin
+ */
+TEST_F(Io, read_hole_from_cached_block)
+{
+ off_t wofs = 0x160c5;
+ ssize_t wsize = 0xa996;
+ off_t rofs = 0x472e;
+ ssize_t rsize = 0xd8d5;
+
+ do_write(wsize, wofs);
+ do_read(rsize, rofs);
+}
+
+/*
+ * Reliable panic; I don't yet know why.
+ * Disabled because it panics.
+ *
+ * fsx -WR -P /tmp -S839 -d -N6 fsx.bin
+ */
+TEST_F(Io, DISABLED_fault_on_nofault_entry)
+{
+ off_t wofs0 = 0x3bad7;
+ ssize_t wsize0 = 0x4529;
+ off_t wofs1 = 0xc30d;
+ ssize_t wsize1 = 0x5f77;
+ off_t truncsize0 = 0x10916;
+ off_t rofs = 0xdf17;
+ ssize_t rsize = 0x29ff;
+ off_t truncsize1 = 0x152b4;
+
+ do_write(wsize0, wofs0);
+ do_write(wsize1, wofs1);
+ do_ftruncate(truncsize0);
+ do_read(rsize, rofs);
+ do_ftruncate(truncsize1);
+ close(m_test_fd);
+}
Modified: projects/fuse2/tests/sys/fs/fusefs/write.cc
==============================================================================
--- projects/fuse2/tests/sys/fs/fusefs/write.cc Tue Jun 11 16:16:16 2019 (r348930)
+++ projects/fuse2/tests/sys/fs/fusefs/write.cc Tue Jun 11 16:32:33 2019 (r348931)
@@ -249,6 +249,44 @@ TEST_F(Write, append)
/* Deliberately leak fd. close(2) will be tested in release.cc */
}
+/* If a file is cached, then appending to the end should not cause a read */
+TEST_F(Write, append_to_cached)
+{
+ const ssize_t BUFSIZE = 9;
+ const char FULLPATH[] = "mountpoint/some_file.txt";
+ const char RELPATH[] = "some_file.txt";
+ char *oldcontents, *oldbuf;
+ const char CONTENTS[BUFSIZE] = "abcdefgh";
+ uint64_t ino = 42;
+ /*
+ * Set offset in between maxbcachebuf boundary to test buffer handling
+ */
+ uint64_t oldsize = m_maxbcachebuf / 2;
+ int fd;
+
+ oldcontents = (char*)calloc(1, oldsize);
+ ASSERT_NE(NULL, oldcontents) << strerror(errno);
+ oldbuf = (char*)malloc(oldsize);
+ ASSERT_NE(NULL, oldbuf) << strerror(errno);
+
+ expect_lookup(RELPATH, ino, oldsize);
+ expect_open(ino, 0, 1);
+ expect_read(ino, 0, oldsize, oldsize, oldcontents);
+ expect_write(ino, oldsize, BUFSIZE, BUFSIZE, CONTENTS);
+
+ /* Must open O_RDWR or fuse(4) implicitly sets direct_io */
+ fd = open(FULLPATH, O_RDWR | O_APPEND);
+ EXPECT_LE(0, fd) << strerror(errno);
+
+ /* Read the old data into the cache */
+ ASSERT_EQ((ssize_t)oldsize, read(fd, oldbuf, oldsize))
+ << strerror(errno);
+
+ /* Write the new data. There should be no more read operations */
+ ASSERT_EQ(BUFSIZE, write(fd, CONTENTS, BUFSIZE)) << strerror(errno);
+ /* Deliberately leak fd. close(2) will be tested in release.cc */
+}
+
TEST_F(Write, append_direct_io)
{
const ssize_t BUFSIZE = 9;
@@ -789,7 +827,7 @@ TEST_F(WriteThrough, DISABLED_writethrough)
EXPECT_LE(0, fd) << strerror(errno);
ASSERT_EQ(bufsize, write(fd, CONTENTS, bufsize)) << strerror(errno);
- /*
+ /*
* A subsequent read should be serviced by cache, without querying the
* filesystem daemon
*/
More information about the svn-src-projects
mailing list