diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
@@ -283,6 +283,9 @@ struct ext4_io_submit {
~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Set the low bits to get the last block in a cluster */
+#define EXT4_LBLK_CFILL(s, lblk) ((lblk) | \
+ ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
/* Get the cluster offset */
#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
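As an aside, the effect of these cluster macros is easy to see in a standalone sketch. The following is a minimal userspace model, not kernel code: it assumes a hypothetical cluster ratio of 16 and uses a local typedef in place of the superblock fields.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t ext4_lblk_t;

/* simplified stand-ins for the superblock-based macros above */
#define CLUSTER_RATIO 16
#define LBLK_CMASK(lblk) ((lblk) & ~((ext4_lblk_t)CLUSTER_RATIO - 1))
#define LBLK_CFILL(lblk) ((lblk) | ((ext4_lblk_t)CLUSTER_RATIO - 1))
#define LBLK_COFF(lblk)  ((lblk) & ((ext4_lblk_t)CLUSTER_RATIO - 1))

int main(void)
{
	ext4_lblk_t lblk = 37;	/* falls in the cluster covering blocks 32..47 */

	printf("first block in cluster: %u\n", LBLK_CMASK(lblk));	/* 32 */
	printf("last block in cluster:  %u\n", LBLK_CFILL(lblk));	/* 47 */
	printf("offset within cluster:  %u\n", LBLK_COFF(lblk));	/* 5 */
	return 0;
}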
@@ -2468,6 +2471,7 @@ extern int ext4_page_mkwrite(struct vm_fault *vmf);
extern int ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
+extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
@@ -3137,6 +3141,8 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+extern void ext4_release_reservations(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
@@ -4627,6 +4627,19 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /*
+ * call to ext4_ext_remove_space() must precede ext4_es_remove_extent()
+ * for correct cluster reservation accounting
+ */
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ if (err)
+ return err;
+
+ if (test_opt(inode->i_sb, DELALLOC))
+ ext4_release_reservations(inode, last_block,
+ EXT_MAX_BLOCKS - last_block);
+
retry:
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
@@ -4635,9 +4648,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- if (err)
- return err;
- return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ return err;
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
@@ -4975,6 +4986,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
}
out:
inode_unlock(inode);
+
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
return ret;
}
@@ -5528,18 +5540,27 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ret = ext4_es_remove_extent(inode, punch_start,
- EXT_MAX_BLOCKS - punch_start);
+ /*
+ * call to ext4_ext_remove_space() must precede ext4_es_remove_extent()
+ * for correct cluster reservation accounting
+ */
+ ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ if (test_opt(inode->i_sb, DELALLOC))
+ ext4_release_reservations(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
+
ext4_discard_preallocations(inode);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
@@ -6010,3 +6031,40 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
return err ? err : mapped;
}
+
+/*
+ * releases the reservations on the delayed allocated clusters found in
+ * the block range of @len blocks starting at @start
+ */
+void ext4_release_reservations(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int n = 0;
+ unsigned long long start_partial, end_partial;
+ int ret;
+
+ n = ext4_es_delayed_clu_partials(inode, start, len, &start_partial,
+ &end_partial);
+
+ if (sbi->s_cluster_ratio > 1) {
+ if (start_partial != ~0) {
+ ret = ext4_clu_mapped(inode, start_partial);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ n++;
+ }
+
+ if ((end_partial != ~0) && (end_partial != start_partial)) {
+ ret = ext4_clu_mapped(inode, end_partial);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ n++;
+ }
+ }
+
+out:
+ ext4_da_release_space(inode, (int) n);
+}
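For reference, the ordering requirement repeated in the comments above can be condensed into one shape, common to the truncate, punch hole, and collapse range paths. remove_range() below is a hypothetical summary for illustration, not a function added by this patch:

/* condensed shape of the removal paths changed by this patch */
static int remove_range(struct inode *inode, ext4_lblk_t start,
			ext4_lblk_t len)
{
	int ret;

	/* 1) remove the blocks from the extent tree first, so that
	 *    ext4_clu_mapped() no longer sees blocks within the range */
	ret = ext4_ext_remove_space(inode, start, start + len - 1);
	if (ret)
		return ret;

	/* 2) count and release delayed reservations while the delayed
	 *    extents are still visible in the extents status tree */
	if (test_opt(inode->i_sb, DELALLOC))
		ext4_release_reservations(inode, start, len);

	/* 3) only then remove the range from the extents status tree */
	return ext4_es_remove_extent(inode, start, len);
}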
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
@@ -1398,3 +1398,161 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
return n;
}
+
+/*
+ * Returns true if there is at least one delayed and not unwritten extent
+ * (a delayed extent whose blocks have not been allocated for an unwritten
+ * extent) in the range specified by @start and @end. Returns false if not.
+ */
+static bool __es_delayed_range(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+
+ es = __es_tree_search(&tree->root, start);
+
+ while (es && (es->es_lblk <= end)) {
+ if (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es))
+ return true;
+ node = rb_next(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+ return false;
+}
+
+/*
+ * Returns true if there are no extents marked written, unwritten, or
+ * delayed anywhere in the range specified by @start and @end. Returns
+ * false otherwise.
+ */
+static bool __es_empty_range(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+
+ es = __es_tree_search(&tree->root, start);
+
+ while (es && (es->es_lblk <= end)) {
+ if (!ext4_es_is_hole(es))
+ return false;
+ node = rb_next(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+ return true;
+}
+
+/*
+ * This function makes a potentially approximate count of the number of
+ * delalloc clusters in the range specified by @lblk and @len. It returns
+ * two kinds of information. The first is the number of whole clusters that
+ * contain delalloc blocks within the specified range. If these clusters are
+ * free of written or unwritten blocks, this is the number of cluster
+ * reservations that should be released if these clusters were to be
+ * deleted.
+ *
+ * It also returns, via @start_partial and @end_partial, the cluster numbers
+ * of the partial clusters (if any) at the start and end of the specified
+ * range that could contribute to the number of reservations that should be
+ * released if the entire range was to be deleted. If a partial cluster
+ * candidate is found, it does not contain written or unwritten blocks
+ * and the remainder of the cluster as found in the extent status tree
+ * does not contain written, unwritten, or delayed blocks. A partial cluster
+ * can contribute to the total delalloc cluster count if the remainder of
+ * the cluster does not contain a written block as recorded in the
+ * extent tree. If a starting or ending delalloc partial cluster candidate
+ * is not found, @start_partial or @end_partial will be set to ~0.
+ *
+ * This function's interface is meant to be similar to ext4_es_remove_extent()
+ * to facilitate integration with that or a similar function in the future
+ * to avoid an extra pass over the extents status tree.
+ */
+unsigned int ext4_es_delayed_clu_partials(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ unsigned long long *start_partial,
+ unsigned long long *end_partial)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_lblk_t end, next, last, end_clu;
+ unsigned int n = 0;
+
+ /* guaranteed to be unequal to any ext4_lblk_t value */
+ *start_partial = *end_partial = ~0;
+
+ if (len == 0)
+ return 0;
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ read_lock(&ei->i_es_lock);
+
+ /*
+ * Examine the starting partial cluster, if any, for a possible delalloc
+ * cluster candidate
+ */
+ end_clu = EXT4_LBLK_CFILL(sbi, lblk);
+ if (EXT4_LBLK_COFF(sbi, lblk)) {
+ /* find first cluster's last block - cluster end or range end */
+ if (end_clu < end)
+ last = end_clu;
+ else
+ last = end;
+ if (__es_empty_range(inode, EXT4_LBLK_CMASK(sbi, lblk),
+ lblk - 1) &&
+ __es_delayed_range(inode, lblk, last)) {
+ *start_partial = EXT4_B2C(sbi, lblk);
+ }
+ next = last + 1;
+ } else {
+ next = lblk;
+ }
+
+ /*
+ * Count the delayed clusters in the cluster-aligned region, if
+ * present. next will be aligned on the start of a cluster.
+ */
+ if ((next <= end) && (EXT4_LBLK_CFILL(sbi, next) <= end)) {
+ if (EXT4_LBLK_CFILL(sbi, end) == end)
+ /* range ends on a cluster boundary */
+ last = end;
+ else
+ /* range ends partway through a cluster */
+ last = EXT4_LBLK_CMASK(sbi, end) - 1;
+ n = __es_delayed_clu(inode, next, last);
+ next = last + 1;
+ }
+
+ /*
+ * Examine the ending partial cluster, if any, for a possible delalloc
+ * cluster candidate
+ */
+ end_clu = EXT4_LBLK_CFILL(sbi, end);
+ if (end != end_clu) {
+ if (next <= end) {
+ /* ending partial cluster case */
+ if (__es_delayed_range(inode, next, end) &&
+ __es_empty_range(inode, end + 1, end_clu)) {
+ *end_partial = EXT4_B2C(sbi, end);
+ }
+ } else {
+ /* single partial cluster in range case */
+ if ((*start_partial != ~0) &&
+ (!__es_empty_range(inode, end + 1, end_clu))) {
+ *start_partial = ~0;
+ }
+ }
+ }
+
+ read_unlock(&ei->i_es_lock);
+
+ return n;
+}
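A worked example may help here. The userspace model below (hypothetical 4-block clusters; the real code reads the ratio from the superblock) decomposes a removal of blocks 2..9 the same way this function does: a starting partial cluster, a cluster-aligned middle whose delayed clusters are counted unconditionally, and an ending partial cluster.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t ext4_lblk_t;

#define CLUSTER_RATIO 4
#define CMASK(b) ((b) & ~((ext4_lblk_t)CLUSTER_RATIO - 1))
#define CFILL(b) ((b) | ((ext4_lblk_t)CLUSTER_RATIO - 1))
#define B2C(b)   ((b) / CLUSTER_RATIO)

int main(void)
{
	ext4_lblk_t lblk = 2, len = 8;	/* remove blocks 2..9 */
	ext4_lblk_t end = lblk + len - 1;

	if (CMASK(lblk) != lblk)	/* range starts mid-cluster */
		printf("start partial: cluster %u, remainder %u..%u\n",
		       B2C(lblk), CMASK(lblk), lblk - 1);
	/* for this input the aligned middle is cluster 1, blocks 4..7 */
	printf("aligned region: blocks %u..%u\n",
	       CFILL(lblk) + 1, CMASK(end) - 1);
	if (CFILL(end) != end)		/* range ends mid-cluster */
		printf("end partial: cluster %u, remainder %u..%u\n",
		       B2C(end), end + 1, CFILL(end));
	return 0;
}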
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
@@ -186,6 +186,12 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
+extern unsigned int ext4_es_delayed_clu_partials(struct inode *inode,
+ ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ unsigned long long *start_partial,
+ unsigned long long *end_partial);
+
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
@@ -378,9 +378,9 @@ void ext4_da_update_reserve_space(struct inode *inode,
dquot_claim_block(inode, EXT4_C2B(sbi, used));
else {
/*
- * We did fallocate with an offset that is already delayed
+ * We allocated a block with an offset that is already delayed
* allocated. So on delayed allocated writeback we should
- * not re-claim the quota for fallocated blocks.
+ * not re-claim the quota for a previously allocated block.
*/
dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
}
@@ -1593,7 +1593,7 @@ static int ext4_da_reserve_space(struct inode *inode)
return 0; /* success */
}
-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4325,19 +4325,31 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ret = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
-
+ /*
+ * call to ext4_ext_remove_space() must precede ext4_es_remove_extent()
+ * for correct cluster reservation accounting
+ */
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_remove_space(inode, first_block,
stop_block - 1);
else
ret = ext4_ind_remove_space(handle, inode, first_block,
stop_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ if (test_opt(inode->i_sb, DELALLOC))
+ ext4_release_reservations(inode, first_block,
+ stop_block - first_block);
+
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))

Once it's possible to accurately determine the number of reserved
clusters outstanding after all allocated blocks in a block range have
been removed from both the extent tree and the extents status tree,
determining the number of reserved clusters to be subtracted from the
reserved cluster count is a relatively straightforward matter of
counting the number of clusters belonging to delayed extents in the
extents status tree which are not shared with any other allocated or
delayed allocated extents.

This can be achieved by reversing the current order in which
ext4_ext_remove_space() and ext4_es_remove_extent() are called. For now,
a call to a new function to count the delayed clusters in the extents
status tree and to adjust the reserved cluster total is inserted between
these calls. This could also be integrated into a new version of
ext4_es_remove_extent() to avoid a second pass over the extents status
tree if performance becomes a concern.

Determining whether a delayed allocated cluster is to be included in the
total to be subtracted when a block range is removed is a little
involved in the code, but the principle is straightforward. A delayed
allocated cluster wholly contained within the block range to be removed
is counted unconditionally. A delayed allocated cluster that is not
wholly contained within the range (referred to as a partial cluster in
the code) only counts towards the total if none of the blocks in the
cluster lying outside the range are included in either another delayed
allocated or allocated extent.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
---
 fs/ext4/ext4.h           |   6 ++
 fs/ext4/extents.c        |  70 ++++++++++++++++++--
 fs/ext4/extents_status.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/extents_status.h |   6 ++
 fs/ext4/inode.c          |  32 +++++++---
 5 files changed, 256 insertions(+), 16 deletions(-)
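To restate the partial cluster rule from the description above as executable logic, here is a small model. The struct and helper names are illustrative only and do not appear in the patch:

#include <stdbool.h>
#include <stdio.h>

/* per-cluster facts that decide a partial cluster's fate */
struct partial_clu {
	bool delayed_in_range;	/* delayed blocks inside the removed range */
	bool remainder_in_es;	/* written/unwritten/delayed blocks outside it */
	bool remainder_mapped;	/* cluster still mapped in the extent tree */
};

/* a partial cluster releases its reservation only if nothing else
 * shares it: the remainder of the cluster must be entirely unused */
static bool releases_reservation(const struct partial_clu *c)
{
	return c->delayed_in_range && !c->remainder_in_es &&
	       !c->remainder_mapped;
}

int main(void)
{
	struct partial_clu shared = { true, false, true };	/* shares a written block */
	struct partial_clu lone = { true, false, false };	/* delayed blocks only */

	printf("shared cluster releases: %d\n", releases_reservation(&shared));	/* 0 */
	printf("lone cluster releases:   %d\n", releases_reservation(&lone));	/* 1 */
	return 0;
}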
Once it's possible to accurately determine the number of reserved clusters outstanding after all allocated blocks in a block range have been removed from both the extent tree and extents status tree, determining the number of reserved clusters to be subtracted from the reserved cluster count is a relatively straightforward matter of counting the number of clusters belonging to delayed extents in the extents status tree which are not shared with any other allocated or delayed allocated extents. This can be achieved by reversing the current order in which ext4_ext_remove_space() and ext4_es_remove_extent() are called. For now, a call to a new function to count the delayed clusters in the extents status tree and to adjust the reserved cluster total is inserted between these calls. This could also be integrated in a new version of ext4_es_remove_extent() to avoid a second pass over the extents status tree if performance becomes a concern. Determining whether a delayed allocated cluster is to be included in the total to be subtracted when a block range is removed is a little involved in the code, but the principle is straightforward. A delayed allocated cluster wholly contained within the block range to be removed is counted unconditionally. A delayed allocated cluster that is not wholly contained within the range (referred to as a partial cluster in the code) only counts towards the total if none of the blocks in the cluster not contained within the extent are not included in either another delayed allocated or allocated extent. Signed-off-by: Eric Whitney <enwlinux@gmail.com> --- fs/ext4/ext4.h | 6 ++ fs/ext4/extents.c | 68 ++++++++++++++++++-- fs/ext4/extents_status.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 6 ++ fs/ext4/inode.c | 32 +++++++--- 5 files changed, 254 insertions(+), 16 deletions(-)