@@ -5,7 +5,8 @@
obj-$(CONFIG_EXT3_FS) += ext3.o
ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
+ ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o \
+ ordered-data.o
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
@@ -38,6 +38,7 @@
#include <linux/bio.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
+#include <linux/workqueue.h>
#include "xattr.h"
#include "acl.h"
@@ -204,6 +205,13 @@ void ext3_delete_inode (struct inode * inode)
if (IS_SYNC(inode))
handle->h_sync = 1;
inode->i_size = 0;
+
+ /*
+ * make sure we clean up any ordered extents that didn't get
+ * IO started on them because i_size shrunk down to zero.
+ */
+ ext3_truncate_ordered_extents(inode, 0);
+
if (inode->i_blocks)
ext3_truncate(inode);
/*
@@ -767,6 +775,24 @@ err_out:
}
/*
+ * This protects the disk i_size with the spinlock for the ordered
+ * extent tree. It returns 1 when the inode needs to be logged
+ * because the i_disksize has been updated.
+ */
+static int maybe_update_disk_isize(struct inode *inode, loff_t new_size)
+{
+ int ret = 0;
+
+ ext3_ordered_lock(inode);
+ if (EXT3_I(inode)->i_disksize < new_size) {
+ EXT3_I(inode)->i_disksize = new_size;
+ ret = 1;
+ }
+ ext3_ordered_unlock(inode);
+ return ret;
+}
+
+/*
* Allocation strategy is simple: if we have to allocate something, we will
* have to go the whole way to leaf. So let's do it before attaching anything
* to tree, set linkage between the newborn blocks, write them if sync is
@@ -815,6 +841,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
if (!partial) {
first_block = le32_to_cpu(chain[depth - 1].key);
clear_buffer_new(bh_result);
+ clear_buffer_datanew(bh_result);
count++;
/*map more blocks*/
while (count < maxblocks && count <= blocks_to_boundary) {
@@ -873,6 +900,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
if (err)
goto cleanup;
clear_buffer_new(bh_result);
+ clear_buffer_datanew(bh_result);
goto got_it;
}
}
@@ -915,14 +943,19 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
* i_disksize growing is protected by truncate_mutex. Don't forget to
* protect it if you're about to implement concurrent
* ext3_get_block() -bzzz
+ *
+ * FIXME, I think this only needs to extend the disk i_size when
+ * we're filling holes that came from using ftruncate to increase
+ * i_size. Need to verify.
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!ext3_should_guard_data(inode) && !err && extend_disksize)
+ maybe_update_disk_isize(inode, inode->i_size);
mutex_unlock(&ei->truncate_mutex);
if (err)
goto cleanup;
set_buffer_new(bh_result);
+ set_buffer_datanew(bh_result);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
@@ -1079,6 +1112,126 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
return NULL;
}
+/*
+ * after a data=guarded IO is done, we need to update the
+ * disk i_size to reflect the data we've written. If there are
+ * no more ordered data extents left in the tree, we need to
+ * get rid of the orphan entry making sure the file's
+ * block pointers match the i_size after a crash
+ */
+static void log_inode_ordered_end(struct inode *inode)
+{
+ handle_t *handle;
+ int orphan_cleanup;
+
+ handle = ext3_journal_start(inode, 3);
+
+ /*
+ * uhoh, should we flag the FS as readonly here? ext3_dirty_inode
+ * doesn't, which is what we're modeling ourselves after
+ */
+ if (IS_ERR(handle))
+ return;
+
+ /*
+ * we don't add orphan while cleaning up orphans, it kills
+ * orphans
+ */
+ orphan_cleanup = EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS;
+ ext3_ordered_lock(inode);
+ if (!orphan_cleanup && inode->i_nlink &&
+ RB_EMPTY_ROOT(&EXT3_I(inode)->ordered_tree.tree)) {
+ /* this also ends up writing our new disk i_size */
+ ext3_ordered_unlock(inode);
+ ext3_orphan_del(handle, inode);
+
+ /*
+ * now we check again to see if we might have dropped
+ * the orphan just after someone added a new ordered extent
+ */
+ ext3_ordered_lock(inode);
+ if (!RB_EMPTY_ROOT(&EXT3_I(inode)->ordered_tree.tree) &&
+ list_empty(&EXT3_I(inode)->i_orphan)) {
+ ext3_ordered_unlock(inode);
+ ext3_orphan_add(handle, inode);
+ } else {
+ ext3_ordered_unlock(inode);
+ }
+ } else {
+ ext3_ordered_unlock(inode);
+ ext3_mark_inode_dirty(handle, inode);
+ }
+ ext3_journal_stop(handle);
+}
+
+/*
+ * data=guarded updates are handled in a workqueue after the IO
+ * is done. This runs through the list of buffer heads that are pending
+ * processing.
+ */
+void ext3_run_guarded_work(struct work_struct *work)
+{
+ struct ext3_sb_info *sbi =
+ container_of(work, struct ext3_sb_info, guarded_work);
+ struct buffer_head *bh;
+ struct ext3_ordered_extent *ordered;
+ struct inode *inode;
+ struct page *page;
+ int must_log;
+
+ spin_lock_irq(&sbi->guarded_lock);
+ while(!list_empty(&sbi->guarded_buffers)) {
+ ordered = list_entry(sbi->guarded_buffers.next,
+ struct ext3_ordered_extent, list);
+ bh = ordered->bh;
+ must_log = 0;
+
+ WARN_ON(!buffer_dataguarded(bh));
+
+ /* we don't need a reference on the buffer head because
+	 * it is locked until the end_io handler is called.
+ *
+ * This means the page can't go away, which means the
+ * inode can't go away
+ */
+ list_del(&ordered->list);
+ spin_unlock_irq(&sbi->guarded_lock);
+
+ page = bh->b_page;
+ inode = page->mapping->host;
+
+ ext3_ordered_lock(inode);
+ if (ordered->bh) {
+ /*
+ * someone might have decided this buffer didn't
+ * really need to be ordered and removed us from
+ * the rb tree. They set ordered->bh to null
+ * when that happens.
+ */
+ must_log = ext3_ordered_update_i_size(inode, ordered);
+ ext3_remove_ordered_extent(inode, ordered);
+ }
+ ext3_ordered_unlock(inode);
+
+ /*
+ * drop the reference taken when this ordered extent was
+ * put onto the guarded_buffers list
+ */
+ ext3_put_ordered_extent(ordered);
+
+ if (must_log > 0)
+ log_inode_ordered_end(inode);
+
+ /*
+ * finally, call the real bh end_io function to do
+ * all the hard work of maintaining page writeback.
+ */
+ end_buffer_async_write(bh, buffer_uptodate(bh));
+ spin_lock_irq(&sbi->guarded_lock);
+ }
+ spin_unlock_irq(&sbi->guarded_lock);
+}
+
static int walk_page_buffers( handle_t *handle,
struct buffer_head *head,
unsigned from,
@@ -1144,6 +1297,41 @@ static int do_journal_get_write_access(handle_t *handle,
return ext3_journal_get_write_access(handle, bh);
}
+/*
+ * this is used during write_begin to find all the buffers that
+ * must be protected by the data=guarded code
+ */
+static int walk_page_for_guard(handle_t *handle, struct buffer_head *bh)
+{
+ struct inode *inode = bh->b_page->mapping->host;
+ u64 offset = page_offset(bh->b_page) + bh_offset(bh);
+ int ret;
+
+ /*
+ * we can trust buffer_new during write_begin,
+ * and we only guard new buffers
+ */
+ if (!buffer_mapped(bh) || buffer_freed(bh) || !buffer_new(bh) ||
+ offset + bh->b_size <= EXT3_I(inode)->i_disksize)
+ return 0;
+
+ ret = ext3_add_ordered_extent(inode, offset, bh);
+
+ /* if we crash before the IO is done, i_size will be small
+ * but these blocks will still be allocated to the file.
+ *
+ * So, add an orphan entry for the file, which will truncate it
+ * down to the i_size it finds after the crash.
+ *
+ * The orphan is cleaned up when the IO is done
+ */
+ if (ret == 0 && buffer_dataguarded(bh) &&
+ list_empty(&EXT3_I(inode)->i_orphan) &&
+ !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS))
+ ret = ext3_orphan_add(handle, inode);
+ return ret;
+}
+
static int ext3_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1185,6 +1373,16 @@ retry:
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, do_journal_get_write_access);
}
+
+ if (ext3_should_guard_data(inode)) {
+ /*
+ * we must do this before the block_write_end because it can
+ * decide to clear the buffer_new bit
+ */
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL, walk_page_for_guard);
+ }
+
write_begin_failed:
if (ret) {
/*
@@ -1212,7 +1410,13 @@ out:
int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
- int err = journal_dirty_data(handle, bh);
+ int err;
+
+ /* don't take buffers from the data=guarded list */
+ if (buffer_dataguarded(bh))
+ return 0;
+
+ err = journal_dirty_data(handle, bh);
if (err)
ext3_journal_abort_handle(__func__, __func__,
bh, handle, err);
@@ -1231,6 +1435,32 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
return 0;
}
+/*
+ * This does old-style data=ordered fallback for data=guarded mode.
+ * We must do the old data=ordered mode when filling holes in the
+ * file, since i_size doesn't protect these at all.
+ *
+ * So, we look for new blocks inside i_disksize. We use buffer_datanew
+ * because buffer_new is cleared in many many places in fs/buffer.c
+ */
+static int journal_dirty_data_guarded_fn(handle_t *handle,
+ struct buffer_head *bh)
+{
+ u64 offset = page_offset(bh->b_page) + bh_offset(bh);
+ struct inode *inode = bh->b_page->mapping->host;
+
+ /*
+ * Write could have mapped the buffer but it didn't copy the data in
+ * yet. So avoid filing such buffer into a transaction.
+ */
+ if (buffer_mapped(bh) && buffer_uptodate(bh) &&
+ offset + bh->b_size < EXT3_I(inode)->i_disksize &&
+ buffer_datanew(bh)) {
+ return ext3_journal_dirty_data(handle, bh);
+ }
+ return 0;
+}
+
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1251,10 +1481,8 @@ static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
/* What matters to us is i_disksize. We don't write i_size anywhere */
if (pos + copied > inode->i_size)
i_size_write(inode, pos + copied);
- if (pos + copied > EXT3_I(inode)->i_disksize) {
- EXT3_I(inode)->i_disksize = pos + copied;
+ if (maybe_update_disk_isize(inode, pos + copied))
mark_inode_dirty(inode);
- }
}
/*
@@ -1300,6 +1528,50 @@ static int ext3_ordered_write_end(struct file *file,
return ret ? ret : copied;
}
+static int ext3_guarded_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ handle_t *handle = ext3_journal_current_handle();
+ struct inode *inode = file->f_mapping->host;
+ unsigned from, to;
+ int ret = 0, ret2;
+
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + copied;
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL, journal_dirty_data_guarded_fn);
+
+ /*
+ * we only update the in-memory i_size. The disk i_size is done
+ * by the end io handlers
+ */
+ if (ret == 0 && pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + copied;
+
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size)
+ ext3_orphan_add(handle, inode);
+ ret2 = ext3_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+ return ret ? ret : copied;
+}
+
static int ext3_writeback_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -1311,6 +1583,7 @@ static int ext3_writeback_write_end(struct file *file,
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
update_file_sizes(inode, pos, copied);
+
/*
* There may be allocated blocks outside of i_size because
* we failed to copy some data. Prepare for truncate.
@@ -1439,6 +1712,7 @@ static int bget_one(handle_t *handle, struct buffer_head *bh)
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
+ clear_buffer_datanew(bh);
put_bh(bh);
return 0;
}
@@ -1574,6 +1848,134 @@ out_fail:
return ret;
}
+/*
+ * Completion handler for block_write_full_page(). This will
+ * kick off the data=guarded workqueue as the IO finishes.
+ */
+static void end_buffer_async_write_guarded(struct buffer_head *bh,
+ int uptodate)
+{
+ struct ext3_sb_info *sbi;
+ struct address_space *mapping;
+ struct ext3_ordered_extent *ordered;
+ unsigned long flags;
+
+ mapping = bh->b_page->mapping;
+ if (!mapping || !bh->b_private || !buffer_dataguarded(bh)) {
+noguard:
+ end_buffer_async_write(bh, uptodate);
+ return;
+ }
+
+ /*
+ * the guarded workqueue function checks the uptodate bit on the
+ * bh and uses that to tell the real end_io handler if things worked
+ * out or not.
+ */
+ if (uptodate)
+ set_buffer_uptodate(bh);
+ else
+ clear_buffer_uptodate(bh);
+
+ sbi = EXT3_SB(mapping->host->i_sb);
+
+ spin_lock_irqsave(&sbi->guarded_lock, flags);
+
+ /*
+	 * remove any chance that a truncate raced in and cleared
+ * our dataguard flag, which also freed the ordered extent in
+ * our b_private.
+ */
+ if (!buffer_dataguarded(bh)) {
+ spin_unlock_irqrestore(&sbi->guarded_lock, flags);
+ goto noguard;
+ }
+ ordered = bh->b_private;
+ list_add_tail(&ordered->list, &sbi->guarded_buffers);
+ ext3_get_ordered_extent(ordered);
+ spin_unlock_irqrestore(&sbi->guarded_lock, flags);
+
+ queue_work(sbi->guarded_wq, &sbi->guarded_work);
+}
+
+static int ext3_guarded_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *page_bufs;
+ handle_t *handle = NULL;
+ int ret = 0;
+ int err;
+
+ J_ASSERT(PageLocked(page));
+
+ /*
+ * We give up here if we're reentered, because it might be for a
+ * different filesystem.
+ */
+ if (ext3_journal_current_handle())
+ goto out_fail;
+
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ page_bufs = page_buffers(page);
+ } else {
+ page_bufs = page_buffers(page);
+ if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+ NULL, buffer_unmapped)) {
+ /* Provide NULL get_block() to catch bugs if buffers
+ * weren't really mapped */
+ return block_write_full_page_endio(page, NULL, wbc,
+ end_buffer_async_write_guarded);
+ }
+ }
+ handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
+
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_fail;
+ }
+
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bget_one);
+
+ ret = block_write_full_page_endio(page, ext3_get_block, wbc,
+ end_buffer_async_write_guarded);
+
+ /*
+ * The page can become unlocked at any point now, and
+ * truncate can then come in and change things. So we
+ * can't touch *page from now on. But *page_bufs is
+ * safe due to elevated refcount.
+ */
+
+ /*
+ * And attach them to the current transaction. But only if
+ * block_write_full_page() succeeded. Otherwise they are unmapped,
+ * and generally junk.
+ */
+ if (ret == 0) {
+ err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ NULL, journal_dirty_data_guarded_fn);
+ if (!ret)
+ ret = err;
+ }
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ err = ext3_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+
+out_fail:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return ret;
+}
+
+
+
static int ext3_writeback_writepage(struct page *page,
struct writeback_control *wbc)
{
@@ -1842,6 +2244,21 @@ static const struct address_space_operations ext3_writeback_aops = {
.is_partially_uptodate = block_is_partially_uptodate,
};
+static const struct address_space_operations ext3_guarded_aops = {
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_guarded_writepage,
+ .sync_page = block_sync_page,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_guarded_write_end,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+};
+
static const struct address_space_operations ext3_journalled_aops = {
.readpage = ext3_readpage,
.readpages = ext3_readpages,
@@ -1860,6 +2277,8 @@ void ext3_set_aops(struct inode *inode)
{
if (ext3_should_order_data(inode))
inode->i_mapping->a_ops = &ext3_ordered_aops;
+ else if (ext3_should_guard_data(inode))
+ inode->i_mapping->a_ops = &ext3_guarded_aops;
else if (ext3_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext3_writeback_aops;
else
@@ -2376,7 +2795,8 @@ void ext3_truncate(struct inode *inode)
if (!ext3_can_truncate(inode))
return;
- if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+ if (inode->i_size == 0 && (ext3_should_writeback_data(inode) ||
+ ext3_should_guard_data(inode)))
ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
/*
@@ -3107,13 +3527,49 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
handle_t *handle;
+ /*
+ * we need to make sure any data=guarded pages
+ * are on disk before we force a new disk i_size
+ * down into the inode. The crucial range is
+ * anything between the disksize on disk now
+ * and the new size we're going to set.
+ *
+ * We're holding i_mutex here, so we know new
+ * ordered extents are not going to appear in the inode
+ */
+ if (ext3_should_guard_data(inode) &&
+ EXT3_I(inode)->i_disksize < attr->ia_size) {
+ filemap_write_and_wait_range(inode->i_mapping,
+ EXT3_I(inode)->i_disksize,
+ attr->ia_size);
+ }
handle = ext3_journal_start(inode, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
+ /*
+ * chop off all the ordered extents after the new
+ * i_size. This prevents leaking of ordered extents
+	 * and it also makes sure the ordered extent code
+ * doesn't mess with the orphan link
+ */
+ if (ext3_should_guard_data(inode))
+ ext3_truncate_ordered_extents(inode, attr->ia_size);
+
error = ext3_orphan_add(handle, inode);
+
+ /*
+ * this is pretty confusing, but we don't need to worry
+ * about guarded i_size here because ext3 truncate fixes
+ * it to the correct i_size when the truncate is all done,
+ * and the ext3_orphan_add makes sure we'll have a sane
+ * i_size after a crash
+ *
+ * Hmmm, why do we set i_disksize here and then again
+ * in ext3_truncate?
+ */
EXT3_I(inode)->i_disksize = attr->ia_size;
rc = ext3_mark_inode_dirty(handle, inode);
if (!error)
@@ -2410,7 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
ext3_mark_inode_dirty(handle, new_inode);
if (!new_inode->i_nlink)
ext3_orphan_add(handle, new_inode);
- if (ext3_should_writeback_data(new_inode))
+ if (ext3_should_writeback_data(new_inode) ||
+ ext3_should_guard_data(new_inode))
flush_file = 1;
}
retval = 0;
new file mode 100644
@@ -0,0 +1,304 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/buffer_head.h>
+#include <linux/ext3_jbd.h>
+
+
+/*
+ * These routines actually implement data=guarded mode, but the
+ * idea is that it may replace data=ordered mode as it becomes stable.
+ */
+
+static u64 entry_end(struct ext3_ordered_extent *entry, unsigned blocksize)
+{
+ return entry->start + blocksize;
+}
+
+/*
+ * returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 start,
+ unsigned blocksize, struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ext3_ordered_extent *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct ext3_ordered_extent, rb_node);
+
+ if (start < entry->start)
+ p = &(*p)->rb_left;
+ else if (start >= entry_end(entry, blocksize))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * find the ordered struct that has this offset
+ */
+static inline struct rb_node *tree_search(struct ext3_ordered_inode_tree *tree,
+					  u64 start, unsigned blocksize)
+{
+	struct rb_node *n = tree->tree.rb_node;
+	struct ext3_ordered_extent *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct ext3_ordered_extent, rb_node);
+
+		if (start < entry->start)
+			n = n->rb_left;
+		else if (start >= entry_end(entry, blocksize))
+			n = n->rb_right;
+		else
+			return n;
+	}
+
+	/* no ordered extent covers this offset */
+	return NULL;
+}
+
+/* allocate and add a new ordered_extent into the per-inode tree.
+ * start is the logical offset in the file
+ *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted, and it also takes a reference on the buffer head.
+ */
+int ext3_add_ordered_extent(struct inode *inode, u64 start,
+ struct buffer_head *bh)
+{
+ struct ext3_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct ext3_ordered_extent *entry;
+ int ret = 0;
+
+ lock_buffer(bh);
+
+ /* ordered extent already there, or in old style data=ordered */
+ if (bh->b_private) {
+ ret = 0;
+ goto out;
+ }
+
+ tree = &EXT3_I(inode)->ordered_tree;
+ entry = kzalloc(sizeof(*entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock(&tree->lock);
+ entry->start = start;
+
+ get_bh(bh);
+ entry->bh = bh;
+ bh->b_private = entry;
+ set_buffer_dataguarded(bh);
+
+ /* one ref for the tree */
+ atomic_set(&entry->refs, 1);
+ INIT_LIST_HEAD(&entry->list);
+ node = tree_insert(&tree->tree, start,
+ inode->i_sb->s_blocksize, &entry->rb_node);
+ BUG_ON(node);
+
+ spin_unlock(&tree->lock);
+out:
+ unlock_buffer(bh);
+ return ret;
+}
+
+/*
+ * used to drop a reference on an ordered extent. This will free
+ * the extent if the last reference is dropped
+ */
+int ext3_put_ordered_extent(struct ext3_ordered_extent *entry)
+{
+ if (atomic_dec_and_test(&entry->refs)) {
+ WARN_ON(entry->bh);
+ kfree(entry);
+ }
+ return 0;
+}
+
+/*
+ * remove an ordered extent from the tree. This removes the
+ * reference held by the rbtree on 'entry' and the
+ * reference on the buffer head held by the entry.
+ */
+int ext3_remove_ordered_extent(struct inode *inode,
+ struct ext3_ordered_extent *entry)
+{
+ struct ext3_ordered_inode_tree *tree;
+ struct rb_node *node;
+
+ tree = &EXT3_I(inode)->ordered_tree;
+ node = &entry->rb_node;
+ clear_buffer_dataguarded(entry->bh);
+ entry->bh->b_private = NULL;
+ brelse(entry->bh);
+ rb_erase(node, &tree->tree);
+ ext3_put_ordered_extent(entry);
+ return 0;
+}
+
+/*
+ * After an extent is done, call this to conditionally update the on disk
+ * i_size. i_size is updated to cover any fully written part of the file.
+ *
+ * This returns < 0 on error, zero if no action needs to be taken and
+ * 1 if the inode must be logged.
+ */
+int ext3_ordered_update_i_size(struct inode *inode,
+ struct ext3_ordered_extent *entry)
+{
+ u64 new_i_size;
+ u64 i_size_test;
+ u64 disk_i_size;
+ struct rb_node *node;
+ struct ext3_ordered_extent *test;
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ int ret = 0;
+
+ disk_i_size = EXT3_I(inode)->i_disksize;
+
+ /*
+ * if the disk i_size is already at the inode->i_size, or
+ * this ordered extent is inside the disk i_size, we're done
+ */
+ if (disk_i_size >= inode->i_size ||
+ entry_end(entry, blocksize) <= disk_i_size)
+ goto out;
+
+ /*
+ * walk backward from this ordered extent to disk_i_size.
+ * if we find an ordered extent then we can't update disk i_size
+ * yet
+ */
+ node = &entry->rb_node;
+ while (1) {
+ node = rb_prev(node);
+ if (!node)
+ break;
+ test = rb_entry(node, struct ext3_ordered_extent, rb_node);
+ if (entry_end(test, blocksize) <= disk_i_size)
+ break;
+ if (test->start >= inode->i_size)
+ break;
+ if (test->start >= disk_i_size)
+ goto out;
+ }
+
+ /* from here, the caller will have to log something, set ret to 1 */
+ ret = 1;
+
+ /*
+ * at this point, we know we can safely update i_size to at least
+ * the offset from this ordered extent. But, we need to
+ * walk forward and see if ios from higher up in the file have
+ * finished.
+ */
+ node = rb_next(&entry->rb_node);
+ i_size_test = 0;
+ if (node) {
+ /*
+ * do we have an area where IO might have finished
+ * between our ordered extent and the next one.
+ */
+ test = rb_entry(node, struct ext3_ordered_extent, rb_node);
+ if (test->start >= entry_end(entry, blocksize))
+ i_size_test = test->start;
+ } else {
+ i_size_test = i_size_read(inode);
+ }
+
+ new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+
+ EXT3_I(inode)->i_disksize = new_i_size;
+out:
+ return ret;
+}
+
+/*
+ * during a truncate or delete, we need to get rid of pending
+ * ordered extents so that there isn't a war over who updates disk i_size first.
+ * This does that, without waiting for any of the IO to actually finish.
+ *
+ * When the IO does finish, it will find the ordered extent removed from the
+ * tree and all will work properly.
+ */
+void ext3_truncate_ordered_extents(struct inode *inode, u64 offset)
+{
+ struct ext3_ordered_inode_tree *tree = &EXT3_I(inode)->ordered_tree;
+ struct rb_node *node;
+ struct ext3_ordered_extent *test;
+
+ spin_lock(&tree->lock);
+ node = rb_last(&tree->tree);
+ while(node) {
+ test = rb_entry(node, struct ext3_ordered_extent, rb_node);
+ /* truncate is going to end up waiting for pages that
+ * straddle the new i_size. So, we can drop the
+ * ordered record for anything that starts before the new
+ * i_size.
+ *
+ * The main goal here is to avoid confusion between the
+ * code in truncate and our end_io handlers
+ */
+ if (test->start < offset)
+ break;
+ node = rb_prev(&test->rb_node);
+
+ /*
+ * once this is called, the end_io handler won't run,
+ * and we won't update disk_i_size to include this buffer.
+ *
+ * That's ok for truncates because the truncate code is
+ * writing a new i_size.
+ *
+ * This ignores any IO in flight, which is ok
+ * because the guarded_buffers list has a reference
+ * on the ordered extent
+ */
+ ext3_remove_ordered_extent(inode, test);
+ }
+ spin_unlock(&tree->lock);
+ return;
+
+}
+
+void ext3_ordered_inode_init(struct ext3_inode_info *ei)
+{
+ ei->ordered_tree.tree.rb_node = NULL;
+ spin_lock_init(&ei->ordered_tree.lock);
+}
+
@@ -37,6 +37,7 @@
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/log2.h>
+#include <linux/workqueue.h>
#include <asm/uaccess.h>
@@ -399,6 +400,9 @@ static void ext3_put_super (struct super_block * sb)
struct ext3_super_block *es = sbi->s_es;
int i, err;
+ flush_workqueue(sbi->guarded_wq);
+ destroy_workqueue(sbi->guarded_wq);
+
ext3_xattr_put_super(sb);
err = journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -468,6 +472,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
#endif
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
+ ext3_ordered_inode_init(ei);
+
return &ei->vfs_inode;
}
@@ -481,6 +487,8 @@ static void ext3_destroy_inode(struct inode *inode)
false);
dump_stack();
}
+ if (EXT3_I(inode)->ordered_tree.tree.rb_node)
+		printk(KERN_ERR "EXT3 ordered data tree not empty on clear inode\n");
kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
}
@@ -528,6 +536,13 @@ static void ext3_clear_inode(struct inode *inode)
EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
}
#endif
+ /*
+ * If pages got cleaned by truncate, truncate should have
+ * gotten rid of the ordered extents. Just in case, drop them
+ * here.
+ */
+ ext3_truncate_ordered_extents(inode, 0);
+
ext3_discard_reservation(inode);
EXT3_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
@@ -634,6 +649,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",data=journal");
else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
seq_puts(seq, ",data=ordered");
+ else if (test_opt(sb, GUARDED_DATA) == EXT3_MOUNT_GUARDED_DATA)
+ seq_puts(seq, ",data=guarded");
else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
seq_puts(seq, ",data=writeback");
@@ -790,7 +807,7 @@ enum {
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore,
+ Opt_data_guarded, Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -832,6 +849,7 @@ static const match_table_t tokens = {
{Opt_abort, "abort"},
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
+ {Opt_data_guarded, "data=guarded"},
{Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"},
@@ -1034,6 +1052,9 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_data_ordered:
data_opt = EXT3_MOUNT_ORDERED_DATA;
goto datacheck;
+ case Opt_data_guarded:
+ data_opt = EXT3_MOUNT_GUARDED_DATA;
+ goto datacheck;
case Opt_data_writeback:
data_opt = EXT3_MOUNT_WRITEBACK_DATA;
datacheck:
@@ -1949,11 +1970,23 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
clear_opt(sbi->s_mount_opt, NOBH);
}
}
+
+ /*
+ * setup the guarded work list
+ */
+ INIT_LIST_HEAD(&EXT3_SB(sb)->guarded_buffers);
+ INIT_WORK(&EXT3_SB(sb)->guarded_work, ext3_run_guarded_work);
+ spin_lock_init(&EXT3_SB(sb)->guarded_lock);
+ EXT3_SB(sb)->guarded_wq = create_workqueue("ext3-guard");
+ if (!EXT3_SB(sb)->guarded_wq) {
+ printk(KERN_ERR "EXT3-fs: failed to create workqueue\n");
+ goto failed_mount_guard;
+ }
+
/*
* The journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
*/
-
root = ext3_iget(sb, EXT3_ROOT_INO);
if (IS_ERR(root)) {
printk(KERN_ERR "EXT3-fs: get root inode failed\n");
@@ -1965,6 +1998,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
goto failed_mount4;
}
+
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
printk(KERN_ERR "EXT3-fs: get root dentry failed\n");
@@ -1974,6 +2008,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
}
ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
@@ -1989,9 +2024,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
printk (KERN_INFO "EXT3-fs: recovery complete.\n");
ext3_mark_recovery_complete(sb, es);
printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
- test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
- test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
- "writeback");
+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
+ test_opt(sb,GUARDED_DATA) == EXT3_MOUNT_GUARDED_DATA ? "guarded":
+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+ "writeback");
lock_kernel();
return 0;
@@ -2003,6 +2039,8 @@ cantfind_ext3:
goto failed_mount;
failed_mount4:
+ destroy_workqueue(EXT3_SB(sb)->guarded_wq);
+failed_mount_guard:
journal_destroy(sbi->s_journal);
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -2018,6 +2018,7 @@ zap_buffer_unlocked:
clear_buffer_mapped(bh);
clear_buffer_req(bh);
clear_buffer_new(bh);
+ clear_buffer_datanew(bh);
bh->b_bdev = NULL;
return may_free;
}
@@ -18,6 +18,7 @@
#include <linux/types.h>
#include <linux/magic.h>
+#include <linux/workqueue.h>
/*
* The second extended filesystem constants/structures
@@ -398,7 +399,6 @@ struct ext3_inode {
#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
-#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
@@ -414,6 +414,12 @@ struct ext3_inode {
#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
* error in ordered mode */
+#define EXT3_MOUNT_GUARDED_DATA 0x800000 /* guard new writes with
+ i_size */
+#define EXT3_MOUNT_DATA_FLAGS (EXT3_MOUNT_JOURNAL_DATA | \
+ EXT3_MOUNT_ORDERED_DATA | \
+ EXT3_MOUNT_WRITEBACK_DATA | \
+ EXT3_MOUNT_GUARDED_DATA)
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
@@ -892,6 +898,7 @@ extern void ext3_get_inode_flags(struct ext3_inode_info *);
extern void ext3_set_aops(struct inode *inode);
extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
+extern void ext3_run_guarded_work(struct work_struct *work);
/* ioctl.c */
extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
@@ -945,7 +952,31 @@ extern const struct inode_operations ext3_special_inode_operations;
extern const struct inode_operations ext3_symlink_inode_operations;
extern const struct inode_operations ext3_fast_symlink_inode_operations;
+/* ordered-data.c */
+int ext3_add_ordered_extent(struct inode *inode, u64 file_offset,
+ struct buffer_head *bh);
+int ext3_put_ordered_extent(struct ext3_ordered_extent *entry);
+int ext3_remove_ordered_extent(struct inode *inode,
+ struct ext3_ordered_extent *entry);
+int ext3_ordered_update_i_size(struct inode *inode,
+ struct ext3_ordered_extent *entry);
+void ext3_ordered_inode_init(struct ext3_inode_info *ei);
+void ext3_truncate_ordered_extents(struct inode *inode, u64 offset);
+
+static inline void ext3_ordered_lock(struct inode *inode)
+{
+ spin_lock(&EXT3_I(inode)->ordered_tree.lock);
+}
+static inline void ext3_ordered_unlock(struct inode *inode)
+{
+ spin_unlock(&EXT3_I(inode)->ordered_tree.lock);
+}
+
+static inline void ext3_get_ordered_extent(struct ext3_ordered_extent *entry)
+{
+ atomic_inc(&entry->refs);
+}
#endif /* __KERNEL__ */
#endif /* _LINUX_EXT3_FS_H */
@@ -20,6 +20,7 @@
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
+#include <linux/spinlock.h>
/* data type for block offset of block group */
typedef int ext3_grpblk_t;
@@ -65,6 +66,39 @@ struct ext3_block_alloc_info {
#define rsv_end rsv_window._rsv_end
/*
+ * used to prevent garbage in files after a crash by
+ * making sure i_size isn't updated until after the IO
+ * is done.
+ *
+ * See fs/ext3/ordered-data.c for the code that uses these.
+ */
+struct buffer_head;
+struct ext3_ordered_inode_tree {
+ /* protects the tree and disk i_size */
+ spinlock_t lock;
+
+ /* rb tree of IO that needs to happen before i_size can be updated */
+ struct rb_root tree;
+};
+
+struct ext3_ordered_extent {
+ /* logical offset of the block in the file */
+ u64 start;
+
+ /* buffer head being written */
+ struct buffer_head *bh;
+
+ /* number of refs on this ordered extent */
+ atomic_t refs;
+
+ /* for the rb tree */
+ struct rb_node rb_node;
+
+ /* list of things being processed by the workqueue */
+ struct list_head list;
+};
+
+/*
* third extended file system inode data in memory
*/
struct ext3_inode_info {
@@ -141,6 +175,8 @@ struct ext3_inode_info {
* by other means, so we have truncate_mutex.
*/
struct mutex truncate_mutex;
+
+ struct ext3_ordered_inode_tree ordered_tree;
struct inode vfs_inode;
};
@@ -21,6 +21,7 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#include <linux/workqueue.h>
#endif
#include <linux/rbtree.h>
@@ -82,6 +83,11 @@ struct ext3_sb_info {
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
+
+ struct workqueue_struct *guarded_wq;
+ struct work_struct guarded_work;
+ struct list_head guarded_buffers;
+ spinlock_t guarded_lock;
};
static inline spinlock_t *
@@ -212,6 +212,17 @@ static inline int ext3_should_order_data(struct inode *inode)
return 0;
}
+static inline int ext3_should_guard_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, GUARDED_DATA) == EXT3_MOUNT_GUARDED_DATA)
+ return 1;
+ return 0;
+}
+
static inline int ext3_should_writeback_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
@@ -291,6 +291,13 @@ enum jbd_state_bits {
BH_State, /* Pins most journal_head state */
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_DataGuarded,	/* ext3 data=guarded mode buffers;
+			 * these have something other than a
+			 * journal_head at b_private */
+ BH_DataNew, /* BH_new gets cleared too early for
+ * data=guarded to use it. So,
+ * this gets set instead.
+ */
};
BUFFER_FNS(JBD, jbd)
@@ -302,6 +309,8 @@ TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
+BUFFER_FNS(DataGuarded, dataguarded)
+BUFFER_FNS(DataNew, datanew)
static inline struct buffer_head *jh2bh(struct journal_head *jh)
{
@@ -441,6 +441,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
}
return err;
}
+EXPORT_SYMBOL(filemap_write_and_wait_range);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -360,7 +360,7 @@ still_busy:
* Completion handler for block_write_full_page() - pages which are unlocked
* during I/O, and which have PageWriteback cleared upon I/O completion.
*/
-static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
char b[BDEVNAME_SIZE];
unsigned long flags;
@@ -438,11 +438,17 @@ static void mark_buffer_async_read(struct buffer_head *bh)
set_buffer_async_read(bh);
}
-void mark_buffer_async_write(struct buffer_head *bh)
+void mark_buffer_async_write_endio(struct buffer_head *bh,
+ bh_end_io_t *handler)
{
- bh->b_end_io = end_buffer_async_write;
+ bh->b_end_io = handler;
set_buffer_async_write(bh);
}
+
+void mark_buffer_async_write(struct buffer_head *bh)
+{
+ mark_buffer_async_write_endio(bh, end_buffer_async_write);
+}
EXPORT_SYMBOL(mark_buffer_async_write);
@@ -1608,7 +1614,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
* unplugging the device queue.
*/
static int __block_write_full_page(struct inode *inode, struct page *page,
- get_block_t *get_block, struct writeback_control *wbc)
+ get_block_t *get_block, struct writeback_control *wbc,
+ bh_end_io_t *handler)
{
int err;
sector_t block;
@@ -1693,7 +1700,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
continue;
}
if (test_clear_buffer_dirty(bh)) {
- mark_buffer_async_write(bh);
+ mark_buffer_async_write_endio(bh, handler);
} else {
unlock_buffer(bh);
}
@@ -1746,7 +1753,7 @@ recover:
if (buffer_mapped(bh) && buffer_dirty(bh) &&
!buffer_delay(bh)) {
lock_buffer(bh);
- mark_buffer_async_write(bh);
+ mark_buffer_async_write_endio(bh, handler);
} else {
/*
* The buffer may have been set dirty during
@@ -2672,7 +2679,8 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
out:
ret = mpage_writepage(page, get_block, wbc);
if (ret == -EAGAIN)
- ret = __block_write_full_page(inode, page, get_block, wbc);
+ ret = __block_write_full_page(inode, page, get_block, wbc,
+ end_buffer_async_write);
return ret;
}
EXPORT_SYMBOL(nobh_writepage);
@@ -2830,9 +2838,10 @@ out:
/*
* The generic ->writepage function for buffer-backed address_spaces
+ * this form passes in the end_io handler used to finish the IO.
*/
-int block_write_full_page(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
+int block_write_full_page_endio(struct page *page, get_block_t *get_block,
+ struct writeback_control *wbc, bh_end_io_t *handler)
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
@@ -2841,7 +2850,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
/* Is the page fully inside i_size? */
if (page->index < end_index)
- return __block_write_full_page(inode, page, get_block, wbc);
+ return __block_write_full_page(inode, page, get_block, wbc,
+ handler);
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2864,9 +2874,20 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
* writes to that region are not written out to the file."
*/
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
- return __block_write_full_page(inode, page, get_block, wbc);
+ return __block_write_full_page(inode, page, get_block, wbc, handler);
}
+/*
+ * The generic ->writepage function for buffer-backed address_spaces
+ */
+int block_write_full_page(struct page *page, get_block_t *get_block,
+ struct writeback_control *wbc)
+{
+ return block_write_full_page_endio(page, get_block, wbc,
+ end_buffer_async_write);
+}
+
+
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block_t *get_block)
{
@@ -3335,9 +3356,11 @@ EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_sync_page);
EXPORT_SYMBOL(block_truncate_page);
EXPORT_SYMBOL(block_write_full_page);
+EXPORT_SYMBOL(block_write_full_page_endio);
EXPORT_SYMBOL(cont_write_begin);
EXPORT_SYMBOL(end_buffer_read_sync);
EXPORT_SYMBOL(end_buffer_write_sync);
+EXPORT_SYMBOL(end_buffer_async_write);
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_cont_expand_simple);
@@ -155,6 +155,7 @@ void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
+void end_buffer_async_write(struct buffer_head *bh, int uptodate);
/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
@@ -197,6 +198,8 @@ extern int buffer_heads_over_limit;
void block_invalidatepage(struct page *page, unsigned long offset);
int block_write_full_page(struct page *page, get_block_t *get_block,
struct writeback_control *wbc);
+int block_write_full_page_endio(struct page *page, get_block_t *get_block,
+ struct writeback_control *wbc, bh_end_io_t *handler);
int block_read_full_page(struct page*, get_block_t*);
int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
unsigned long from);