@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o
+ ext4_jbd2.o migrate.o mballoc.o defrag.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
new file mode 100644
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2009, NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ * Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Online defragmentation for EXT4 */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "group.h"
+
+/**
+ * ext4_defrag_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode: inode which is searched
+ * @path: this will obtain data for the next extent
+ * @extent: pointer to the next extent we have just gotten
+ *
+ * Search the next extent in the array of ext4_ext_path structure (@org_path)
+ * and set it to ext4_extent structure (@extent). In addition, the member of
+ * @org_path (->p_ext) also points the next extent. Return 0 on success, 1 if
+ * ext4_ext_path structure refers to the last extent, or a negative error
+ * value on failure.
+ */
+static int
+ext4_defrag_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent)
+{
+ int ppos, leaf_ppos = path->p_depth;
+
+ ppos = leaf_ppos;
+ if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+ /* leaf block */
+ *extent = ++path[ppos].p_ext;
+ return 0;
+ }
+
+ while (--ppos >= 0) {
+ if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+ path[ppos].p_idx) {
+ int cur_ppos = ppos;
+
+ /* index block */
+ path[ppos].p_idx++;
+ path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+ if (path[ppos+1].p_bh)
+ brelse(path[ppos+1].p_bh);
+ path[ppos+1].p_bh =
+ sb_bread(inode->i_sb, path[ppos].p_block);
+ if (!path[ppos+1].p_bh)
+ goto err;
+ path[ppos+1].p_hdr =
+ ext_block_hdr(path[ppos+1].p_bh);
+
+ /* Halfway index block */
+ while (++cur_ppos < leaf_ppos) {
+ path[cur_ppos].p_idx =
+ EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+ path[cur_ppos].p_block =
+ idx_pblock(path[cur_ppos].p_idx);
+ if (path[cur_ppos+1].p_bh)
+ brelse(path[cur_ppos+1].p_bh);
+ path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+ path[cur_ppos].p_block);
+ if (!path[cur_ppos+1].p_bh)
+ goto err;
+ path[cur_ppos+1].p_hdr =
+ ext_block_hdr(path[cur_ppos+1].p_bh);
+ }
+
+ /* leaf block */
+ path[leaf_ppos].p_ext = *extent =
+ EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ return 0;
+ }
+ }
+ /* We found the last extent */
+ return 1;
+err:
+ if (path)
+ ext4_ext_drop_refs(path);
+ return -EIO;
+}
+
+static int
+ext4_defrag_partial(struct file *o_filp, struct inode *dest_inode,
+ pgoff_t org_page_offset, int data_offset_in_page,
+ int block_len_in_page)
+{
+ return 0;
+}
+
+/**
+ * ext4_defrag_check - Check the environment whether a defrag can be done
+ *
+ * @org_inode: original inode
+ * @dest_inode: destination inode
+ * @start: defrag start offset in blocks
+ * @len: defrag size in blocks
+ *
+ * Check the arguments of ext4_defrag() whether the files can do defrag.
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+ext4_defrag_check_arguments(struct inode *org_inode, struct inode *dest_inode,
+ ext4_lblk_t start, ext4_lblk_t len)
+{
+ if (!S_ISREG(org_inode->i_mode) || !S_ISREG(dest_inode->i_mode)) {
+ printk(KERN_ERR "ext4 defrag: "
+ "The argument files should be regular file\n");
+ return -EINVAL;
+ }
+
+ if (org_inode->i_rdev != dest_inode->i_rdev) {
+ printk(KERN_ERR "ext4 defrag: "
+ "The argument files should be in same filesystem \n");
+ return -EINVAL;
+ }
+
+ if (org_inode->i_ino == dest_inode->i_ino) {
+ printk(KERN_ERR "ext4 defrag: "
+ "The argument files should not be same\n");
+ return -EINVAL;
+ }
+
+ /* Ext4 online defrag supports only extent based file */
+ if (!(EXT4_I(org_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents "
+ "based file\n", org_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+ if (!(EXT4_I(dest_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents "
+ "based file\n", dest_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ if (i_size_read(org_inode) > i_size_read(dest_inode)) {
+ if (start >= i_size_read(dest_inode)) {
+ printk(KERN_ERR "ext4 defrag: Start offset[%u] is "
+ "larger than destination file size\n",
+ start);
+ return -EINVAL;
+ }
+
+ if (start + len > i_size_read(dest_inode)) {
+ printk(KERN_ERR "ext4 defrag: Adjust defrag length "
+ "because it[%u] is larger than destination "
+ "file size\n", len);
+ len = i_size_read(dest_inode) - start;
+ }
+ } else {
+ if (start >= i_size_read(org_inode)) {
+ printk(KERN_ERR "ext4 defrag: Start offset[%u] is "
+ "larger than original file size\n", start);
+ return -EINVAL;
+ }
+
+ if (start + len > i_size_read(org_inode)) {
+ printk(KERN_ERR "ext4 defrag: Adjust defrag length "
+ " because it[%u] is larger than original "
+ "file size\n", len);
+ len = i_size_read(org_inode) - start;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * ext4_defrag_lock_two - acquire two inodes' lock
+ *
+ * @first: inode structure to be locked first
+ * @second: inode structure to be locked second
+ *
+ * Lock mutex and semaphore of the two inodes (@first and @second). First
+ * @first is locked and then @second is locked. Note that we *must* set
+ * arguments in order of small address.
+ */
+static void ext4_defrag_lock_two(struct inode *first, struct inode *second)
+{
+ mutex_lock(&first->i_mutex);
+ down_write(&EXT4_I(first)->i_data_sem);
+
+ mutex_lock(&second->i_mutex);
+ down_write(&EXT4_I(second)->i_data_sem);
+
+ return;
+}
+
+/**
+ * ext4_defrag_unlock_two - release two inodes' lock
+ *
+ * @first: inode structure to be released its lock first
+ * @second: inode structure to be released its lock second
+ *
+ * Release mutex and semaphore of two inodes (@first and @second). First
+ * @first's lock is released and then @second's lock is released. Note that
+ * we *must* set arguments in order of big address.
+ */
+static void ext4_defrag_unlock_two(struct inode *first, struct inode *second)
+{
+ mutex_unlock(&first->i_mutex);
+ up_write(&EXT4_I(first)->i_data_sem);
+
+ mutex_unlock(&second->i_mutex);
+ up_write(&EXT4_I(second)->i_data_sem);
+
+ return;
+}
+
+/**
+ * ext4_defrag - Defrag the specified range of a file
+ *
+ * If no-option is specified, ext4_defrag() proceeds the following order.
+ * 1.ext4_defrag() calculates the block number where defrag terminates
+ * by the start block number (start) and the size of defraged data
+ * (len) specified as arguments.
+ * If the defrag_start points a hole, the extent's start offset pointed by
+ * ext_cur (current extent), holecheck_path, org_path are set after
+ * hole behind.
+ * 2.Continue step 3 to step 5, until the holecheck_path points to last_extent
+ * or the ext_cur exceeds the block_end which is last logical block number.
+ * 3.To get a length of continues area, call ext4_defrag_next_extent()
+ * specified with the ext_cur (initial value is holecheck_path) re-cursive,
+ * until find un-continuous extent, the start logical block number exceeds
+ * the block_end or the extent points to the last extent.
+ * 4.Exchange the original inode data with destination inode data
+ * from org_page_offset to seq_end_page.
+ * The start indexes of data are specified as arguments.
+ * That of the original inode is org_page_offset,
+ * and that of the destination inode is dest_block_offset
+ * (To easily handle blocksize != pagesize case, the offset for the
+ * destination inode is block unit).
+ * 5.Update holecheck_path and org_path to points a next proceeding extent,
+ * then returns to step 2.
+ * 6.Release holecheck_path, org_path,
+ * and returns the len which is the size of defraged data.
+ * The len is used for the command to calculate the file offset
+ * where a next defrag processing start.
+ * (Since the defrag command calls defrag_ioctl() by 64MB unit,
+ * a file bigger than 64MB calls defrag_ioctl multiple times.)
+ *
+ * @o_filp: file structure of the original file
+ * @d_filp: file structure of the destination file
+ * @start: defrag start offset in block
+ * @len: defrag length in block
+ *
+ * This function returns the number of blocks if succeed, otherwise
+ * returns error value.
+ *
+ * Note that the extents of target file(@o_filp) are always converted
+ * from "uninitialized" to "initialized" after swapping the blocks
+ * between the original file and the destination one(@d_filp).
+ * In the next version, this behavior will be fixed not to change extents'
+ * initializing status.
+ */
+int
+ext4_defrag(struct file *o_filp, struct file *d_filp,
+ ext4_lblk_t block_start, ext4_lblk_t len)
+{
+ struct inode *org_inode = o_filp->f_dentry->d_inode;
+ struct inode *dest_inode = d_filp->f_dentry->d_inode;
+ struct inode *ip = org_inode, *tip = dest_inode;
+ struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
+ struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+ ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+ ext4_lblk_t rest_blocks;
+ pgoff_t org_page_offset, seq_end_page;
+ int ret, depth, last_extent = 0;
+ int blocks_per_page;
+ int data_offset_in_page;
+ int block_len_in_page;
+
+ /* Check the filesystem environment whether defrag can be done */
+ ret = ext4_defrag_check_arguments(org_inode, dest_inode,
+ block_start, len);
+ if (ret < 0)
+ return ret;
+
+ blocks_per_page = PAGE_CACHE_SIZE >> org_inode->i_blkbits;
+ file_end = (i_size_read(org_inode) - 1) >> org_inode->i_blkbits;
+ block_end = block_start + len - 1;
+ if (file_end < block_end)
+ len -= block_end - file_end;
+
+ /* Lock the lower address inode first */
+ if (ip > tip) {
+ ip = dest_inode;
+ tip = org_inode;
+ }
+ ext4_defrag_lock_two(ip, tip);
+
+ org_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+ if (IS_ERR(org_path)) {
+ ret = PTR_ERR(org_path);
+ org_path = NULL;
+ goto out;
+ }
+
+ /* Get path structure to check the hole */
+ holecheck_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+ if (IS_ERR(holecheck_path)) {
+ ret = PTR_ERR(holecheck_path);
+ holecheck_path = NULL;
+ goto out;
+ }
+
+ depth = ext_depth(org_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+ if (ext_cur == NULL)
+ goto out;
+
+ /*
+ * Get proper extent whose ee_block is beyond block_start
+ * if block_start was within the hole.
+ */
+ if (le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ last_extent = ext4_defrag_next_extent(org_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ last_extent = ext4_defrag_next_extent(org_inode, org_path,
+ &ext_dummy);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ }
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+
+ /* No blocks within the specified range. */
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+ printk(KERN_INFO "ext4 defrag: The specified range of file"
+ " may be the hole\n");
+ goto out;
+ }
+
+ /* Adjust start blocks */
+ add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+ /* Adjust tail blocks */
+ if (seq_start + seq_blocks - 1 > block_end)
+ seq_blocks = block_end - seq_start + 1;
+
+ ext_prev = ext_cur;
+ last_extent = ext4_defrag_next_extent(org_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ break;
+ }
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+ /*
+ * Extend the length of contiguous block (seq_blocks)
+ * if extents are contiguous.
+ */
+ if (le32_to_cpu(ext_prev->ee_block) +
+ ext4_ext_get_actual_len(ext_prev) ==
+ le32_to_cpu(ext_cur->ee_block) &&
+ block_end >= le32_to_cpu(ext_cur->ee_block) &&
+ !last_extent) {
+ continue;
+ }
+
+ data_offset_in_page = seq_start % blocks_per_page;
+
+ /*
+ * Calculate data blocks count that should be swapped
+ * at the first page.
+ */
+ if (data_offset_in_page + seq_blocks > blocks_per_page) {
+ /* Swapped blocks are across pages */
+ block_len_in_page =
+ blocks_per_page - data_offset_in_page;
+ } else {
+ /* Swapped blocks are in a page */
+ block_len_in_page = seq_blocks;
+ }
+
+ org_page_offset = seq_start >>
+ (PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+ seq_end_page = (seq_start + seq_blocks - 1) >>
+ (PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ rest_blocks = seq_blocks;
+
+ /*
+ * Discard all preallocations.
+ * This is provisional solution.
+ * When true ext4_mb_return_to_preallocation() is
+ * implemented, this will be removed.
+ */
+ ext4_discard_preallocations(org_inode);
+ ext4_discard_preallocations(dest_inode);
+
+ while (org_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+ ret = ext4_defrag_partial(o_filp, dest_inode,
+ org_page_offset, data_offset_in_page,
+ block_len_in_page);
+ if (ret < 0)
+ goto out;
+
+ org_page_offset++;
+
+ data_offset_in_page = 0;
+ rest_blocks -= block_len_in_page;
+ if (rest_blocks > blocks_per_page)
+ block_len_in_page = blocks_per_page;
+ else
+ block_len_in_page = rest_blocks;
+ }
+
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+ holecheck_path = ext4_ext_find_extent(org_inode,
+ seq_start, holecheck_path);
+ if (IS_ERR(holecheck_path)) {
+ ret = PTR_ERR(holecheck_path);
+ holecheck_path = NULL;
+ break;
+ }
+ depth = holecheck_path->p_depth;
+
+ /* Decrease buffer counter */
+ if (org_path)
+ ext4_ext_drop_refs(org_path);
+ org_path = ext4_ext_find_extent(org_inode, seq_start, org_path);
+ if (IS_ERR(org_path)) {
+ ret = PTR_ERR(org_path);
+ org_path = NULL;
+ break;
+ }
+
+ ext_cur = holecheck_path[depth].p_ext;
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+ seq_blocks = 0;
+
+ }
+out:
+ if (org_path) {
+ ext4_ext_drop_refs(org_path);
+ kfree(org_path);
+ }
+ if (holecheck_path) {
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+
+ ext4_defrag_unlock_two(tip, ip);
+
+ return ret ? ret : len;
+}
@@ -293,6 +293,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE _IO('f', 9)
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_DEFRAG _IOW('f', 15, struct move_extent)
/*
* ioctl commands in 32 bit emulation
@@ -313,6 +314,13 @@ struct ext4_new_group_data {
#define EXT4_IOC_DEBUG_DELALLOC _IO('f', 42)
+struct move_extent {
+ int org_fd; /* original file descriptor */
+ int dest_fd; /* destination file descriptor */
+ ext4_lblk_t start; /* logical offset of org_fd and dest_fd*/
+ ext4_lblk_t len; /* exchange block length */
+};
+
/*
* Mount options
*/
@@ -1016,6 +1024,10 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
struct buffer_head ** bh);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+/* defrag.c */
+extern int ext4_defrag(struct file *o_filp, struct file *d_filp,
+ ext4_lblk_t start, ext4_lblk_t len);
+
/* dir.c */
extern int ext4_check_dir_entry(const char *, struct inode *,
struct ext4_dir_entry_2 *,
@@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
}
extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
@@ -49,7 +49,7 @@
* ext_pblock:
* combine low and high parts of physical block number into ext4_fsblk_t
*/
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
{
ext4_fsblk_t block;
@@ -1328,7 +1328,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
return err;
}
-static int
+int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/smp_lock.h>
#include <linux/mount.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -225,6 +226,38 @@ setversion_out:
return err;
}
+
+ case EXT4_IOC_DEFRAG: {
+ struct move_extent me;
+ struct file *d_filp;
+ int fput_needed;
+ int err;
+
+ if (copy_from_user(&me,
+ (struct move_extent __user *)arg, sizeof(me)))
+ return -EFAULT;
+
+
+ d_filp = fget_light(me.dest_fd, &fput_needed);
+ if (!d_filp)
+ return -EBADF;
+
+ if (!capable(CAP_DAC_OVERRIDE)) {
+ if ((current->real_cred->fsuid != inode->i_uid) ||
+ !(inode->i_mode & S_IRUSR) ||
+ !(d_filp->f_dentry->d_inode->i_mode & S_IRUSR)) {
+ fput_light(d_filp, fput_needed);
+ return -EACCES;
+ }
+ }
+
+ err = ext4_defrag(filp, d_filp, me.start, me.len);
+
+ fput_light(d_filp, fput_needed);
+
+ return err;
+ }
+
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
struct super_block *sb = inode->i_sb;