===next3_snapshot_hooks_data.patch===
next3: snapshot hooks - move data blocks
Before every regular file data buffer write,
the function next3_get_block() is called to map the buffer to disk.
We use this hook to call the snapshot API next3_snapshot_get_move_access(),
to optionally move the block to the snapshot file.
Signed-off-by: Amir Goldstein <amir73il@users.sf.net>
---
inode.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
snapshot.h | 47 ++++++++++++
2 files changed, 257 insertions(+), 3 deletions(-)
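
As background for the hunks below, here is a minimal userspace sketch of
the move-on-write decision that next3_get_block() delegates to
next3_snapshot_get_move_access(). Every name in the sketch (toy_block,
toy_get_move_access, toy_write_block) is a hypothetical illustration, not
the next3 API; it only models the return convention visible in the hunks:
>0 = block must be moved to the snapshot, 0 = write in place.

	/* toy model of move-on-write; hypothetical names, not kernel code */
	#include <stdio.h>
	#include <stdbool.h>

	struct toy_block {
		unsigned long blocknr;		/* on-disk block number */
		bool moved_since_snapshot;	/* already owned by snapshot? */
	};

	/* mirror the query convention: >0 = must move, 0 = write in place */
	static int toy_get_move_access(const struct toy_block *b,
				       bool snap_active)
	{
		if (!snap_active || b->moved_since_snapshot)
			return 0;
		return 1;
	}

	/* rewrite path: hand the old block to the snapshot, then remap */
	static void toy_write_block(struct toy_block *b, bool snap_active,
				    unsigned long *next_free)
	{
		if (toy_get_move_access(b, snap_active) > 0) {
			b->moved_since_snapshot = true; /* snapshot keeps old block */
			b->blocknr = (*next_free)++;    /* inode gets a fresh block */
		}
		printf("write lands on block %lu\n", b->blocknr);
	}

	int main(void)
	{
		unsigned long next_free = 100;
		struct toy_block b = { .blocknr = 42,
				       .moved_since_snapshot = false };

		toy_write_block(&b, true, &next_free); /* moved: lands on 100 */
		toy_write_block(&b, true, &next_free); /* in place: stays on 100 */
		return 0;
	}
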
@@ -827,6 +827,43 @@ int next3_get_blocks_handle(handle_t *ha
partial = next3_get_branch(inode, depth, offsets, chain, &err);
+ if (!partial && create && buffer_move_data(bh_result)) {
+ BUG_ON(!next3_snapshot_should_move_data(inode));
+ first_block = le32_to_cpu(chain[depth - 1].key);
+ blocks_to_boundary = 0;
+ /* should move 1 data block to snapshot? */
+ err = next3_snapshot_get_move_access(handle, inode,
+ first_block, 0);
+ if (err)
+ /* do not map found block */
+ partial = chain + depth - 1;
+ if (err < 0)
+ /* cleanup the whole chain and exit */
+ goto cleanup;
+ if (buffer_direct_io(bh_result)) {
+ /* suppress direct I/O write to block that needs to be moved */
+ err = 0;
+ goto cleanup;
+ }
+ if (err > 0)
+ /* check again under truncate_mutex */
+ err = -EAGAIN;
+ }
+ if (partial && create && buffer_direct_io(bh_result)) {
+ /* suppress direct I/O write to holes */
+ loff_t end = ((iblock + maxblocks - 1) << inode->i_blkbits) + 1;
+ /*
+ * we do not know the original write length, but it has to be at least
+ * 1 byte into the last requested block. if the minimal length write
+ * isn't going to extend i_size, we must be cautious and assume that
+ * direct I/O is async and refuse to fill the hole.
+ */
+ if (end <= inode->i_size) {
+ err = 0;
+ goto cleanup;
+ }
+ }
+
/* Simplest case - block found, no allocation needed */
if (!partial) {
first_block = le32_to_cpu(chain[depth - 1].key);
@@ -883,6 +920,20 @@ int next3_get_blocks_handle(handle_t *ha
partial--;
}
partial = next3_get_branch(inode, depth, offsets, chain, &err);
+ if (!partial && buffer_move_data(bh_result)) {
+ BUG_ON(!next3_snapshot_should_move_data(inode));
+ first_block = le32_to_cpu(chain[depth - 1].key);
+ blocks_to_boundary = 0;
+ /* should move 1 data block to snapshot? */
+ err = next3_snapshot_get_move_access(handle, inode,
+ first_block, 0);
+ if (err)
+ /* re-allocate 1 data block */
+ partial = chain + depth - 1;
+ if (err < 0)
+ /* cleanup the whole chain and exit */
+ goto out_mutex;
+ }
if (!partial) {
count++;
mutex_unlock(&ei->truncate_mutex);
@@ -919,6 +970,43 @@ int next3_get_blocks_handle(handle_t *ha
if (err)
goto out_mutex;
+ if (*(partial->p)) {
+ int ret;
+
+ /* old block is being replaced with a new block */
+ if (buffer_partial_write(bh_result) &&
+ !buffer_uptodate(bh_result)) {
+ /* read old block data before moving it to snapshot */
+ map_bh(bh_result, inode->i_sb,
+ le32_to_cpu(*(partial->p)));
+ ll_rw_block(READ, 1, &bh_result);
+ wait_on_buffer(bh_result);
+ /* clear old block mapping */
+ clear_buffer_mapped(bh_result);
+ if (!buffer_uptodate(bh_result)) {
+ err = -EIO;
+ goto out_mutex;
+ }
+ }
+
+ if (buffer_partial_write(bh_result))
+ /* prevent zero out of page in block_write_begin() */
+ SetPageUptodate(bh_result->b_page);
+
+ /* move old block to snapshot */
+ ret = next3_snapshot_get_move_access(handle, inode,
+ le32_to_cpu(*(partial->p)), 1);
+ if (ret < 1) {
+ /* failed to move to snapshot - free new block */
+ next3_free_blocks(handle, inode,
+ le32_to_cpu(partial->key), 1);
+ err = ret ? : -EIO;
+ goto out_mutex;
+ }
+ /* block moved to snapshot - continue to splice new block */
+ err = 0;
+ }
+
/*
* The next3_splice_branch call will free and forget any buffers
* on the new chain if there is a failure, but that risks using
@@ -981,6 +1069,13 @@ static int next3_get_block(struct inode
goto out;
}
started = 1;
+ /*
+ * signal next3_get_blocks_handle() to return unmapped block if
+ * block is not allocated or if it needs to be moved to snapshot.
+ */
+ set_buffer_direct_io(bh_result);
+ if (next3_snapshot_should_move_data(inode))
+ set_buffer_move_data(bh_result);
}
ret = next3_get_blocks_handle(handle, inode, iblock,
@@ -1166,6 +1261,71 @@ static void next3_truncate_failed_write(
next3_truncate(inode);
}
+/*
+ * Check if a buffer was written since the last snapshot was taken.
+ * In data=ordered, the only mode supported by next3, all dirty data buffers
+ * are flushed on snapshot take via the freeze_fs() API, so buffer_jbd(bh)
+ * means that the buffer was declared dirty data after snapshot take.
+ */
+static int buffer_first_write(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_jbd(bh);
+}
+
+static int set_move_data(handle_t *handle, struct buffer_head *bh)
+{
+ BUG_ON(buffer_move_data(bh));
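+ /* unmap buffer so block_write_begin() calls get_block() to remap it */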
+ clear_buffer_mapped(bh);
+ set_buffer_move_data(bh);
+ return 0;
+}
+
+static int set_partial_write(handle_t *handle, struct buffer_head *bh)
+{
+ BUG_ON(buffer_partial_write(bh));
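+ /* signal get_block() to read old block data before move-on-write */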
+ set_buffer_partial_write(bh);
+ return 0;
+}
+
+static void set_page_move_data(struct page *page, unsigned from, unsigned to)
+{
+ struct buffer_head *page_bufs;
+
+ BUG_ON(!page_has_buffers(page));
+ page_bufs = page_buffers(page);
+ /*
+ * make sure that get_block() is called even for mapped buffers,
+ * but not if all buffers were written since last snapshot take.
+ */
+ if (walk_page_buffers(NULL, page_bufs, from, to,
+ NULL, buffer_first_write)) {
+ /* signal get_block() to move-on-write */
+ walk_page_buffers(NULL, page_bufs, from, to,
+ NULL, set_move_data);
+ if (from > 0 || to < PAGE_CACHE_SIZE)
+ /* signal get_block() to update page before move-on-write */
+ walk_page_buffers(NULL, page_bufs, from, to,
+ NULL, set_partial_write);
+ }
+}
+
+static int clear_move_data(handle_t *handle, struct buffer_head *bh)
+{
+ clear_buffer_partial_write(bh);
+ clear_buffer_move_data(bh);
+ return 0;
+}
+
+static void clear_page_move_data(struct page *page)
+{
+ /*
+ * partial_write/move_data flags are used to pass the move data block
+ * request to next3_get_block() and should be cleared at all other times.
+ */
+ BUG_ON(!page_has_buffers(page));
+ walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE,
+ NULL, clear_move_data);
+}
+
static int next3_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1198,8 +1358,21 @@ retry:
ret = PTR_ERR(handle);
goto out;
}
+ /*
+ * only data=ordered mode is supported with snapshots, so the
+ * buffer heads are going to be attached sooner or later anyway.
+ */
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+ /*
+ * Check if blocks need to be moved-on-write. If they do, unmap buffers
+ * and call block_write_begin() to remap them.
+ */
+ if (next3_snapshot_should_move_data(inode))
+ set_page_move_data(page, from, to);
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
next3_get_block);
+ clear_page_move_data(page);
if (ret)
goto write_begin_failed;
@@ -1546,6 +1719,12 @@ static int next3_ordered_writepage(struc
(1 << BH_Dirty)|(1 << BH_Uptodate));
page_bufs = page_buffers(page);
} else {
+ /*
+ * Check if blocks need to be moved-on-write. If they do, unmap
+ * buffers and fall through to the get_block() path.
+ */
+ if (next3_snapshot_should_move_data(inode))
+ set_page_move_data(page, 0, PAGE_CACHE_SIZE);
page_bufs = page_buffers(page);
if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, buffer_unmapped)) {
@@ -1565,6 +1744,7 @@ static int next3_ordered_writepage(struc
PAGE_CACHE_SIZE, NULL, bget_one);
ret = block_write_full_page(page, next3_get_block, wbc);
+ clear_page_move_data(page);
/*
* The page can become unlocked at any point now, and
@@ -1754,6 +1934,19 @@ static ssize_t next3_direct_IO(int rw, s
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
int retries = 0;
+ int flags;
+
+ /*
+ * suppress DIO_SKIP_HOLES to make sure that direct I/O writes always
+ * call next3_get_block() with create=1, so that we can fall back to
+ * buffered I/O when data blocks need to be moved to snapshot.
+ */
+ if (NEXT3_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ NEXT3_FEATURE_RO_COMPAT_HAS_SNAPSHOT))
+ flags = DIO_LOCKING;
+ else
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -1776,9 +1969,8 @@ static ssize_t next3_direct_IO(int rw, s
}
retry:
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- next3_get_block, NULL);
+ ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+ offset, nr_segs, next3_get_block, NULL, NULL, flags);
if (ret == -ENOSPC && next3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -1964,6 +2156,21 @@ static int next3_block_truncate_page(han
goto unlock;
}
+ /* check if block needs to be moved to snapshot before zeroing */
+ if (next3_snapshot_should_move_data(inode) &&
+ buffer_first_write(NULL, bh)) {
+ set_buffer_move_data(bh);
+ err = next3_get_block(inode, iblock, bh, 1);
+ clear_buffer_move_data(bh);
+ if (err)
+ goto unlock;
+ if (buffer_new(bh)) {
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ clear_buffer_new(bh);
+ }
+ }
+
if (next3_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
err = next3_journal_get_write_access(handle, bh);
@@ -97,6 +97,15 @@
#define SNAPSHOT_SET_DISABLED(inode) \
i_size_write((inode), 0)
+enum next3_bh_state_bits {
+ BH_Partial_Write = 29, /* Buffer should be uptodate before write */
+ BH_Direct_IO = 30, /* Buffer is under direct I/O */
+ BH_Move_Data = 31, /* Data block may need to be moved-on-write */
+};
+
+BUFFER_FNS(Partial_Write, partial_write)
+BUFFER_FNS(Direct_IO, direct_io)
+BUFFER_FNS(Move_Data, move_data)
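
For reference, BUFFER_FNS() is the stock helper-generator macro from
<linux/buffer_head.h>; BUFFER_FNS(Move_Data, move_data) produces helpers
roughly along these lines (a simplified sketch, not the exact kernel
expansion):

	/* simplified sketch of the helpers generated by BUFFER_FNS() */
	static inline void set_buffer_move_data(struct buffer_head *bh)
	{
		set_bit(BH_Move_Data, &bh->b_state);
	}
	static inline void clear_buffer_move_data(struct buffer_head *bh)
	{
		clear_bit(BH_Move_Data, &bh->b_state);
	}
	static inline int buffer_move_data(struct buffer_head *bh)
	{
		return test_bit(BH_Move_Data, &bh->b_state);
	}

The three bits are taken from the top of b_state, presumably to stay clear
of the BH_PrivateStart range that jbd already claims for its own flags.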