diff mbox series

[v2,3/5] ext4: rework fast commit commit path

Message ID 20220308163319.1183625-4-harshads@google.com
State Superseded
Headers show
Series ext4: improve commit path performance for fast commit | expand

Commit Message

harshad shirwadkar March 8, 2022, 4:33 p.m. UTC
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>

This patch reworks fast commit's commit path so that the journal is no
longer locked for the entire duration of a fast commit. Instead, we only
lock the journal while marking all the eligible inodes as "committing".
This allows handles to make progress in parallel with the fast commit.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/fast_commit.c | 77 ++++++++++++++++++++++++++-----------------
 fs/jbd2/journal.c     |  2 --
 2 files changed, 47 insertions(+), 32 deletions(-)

Comments

Jan Kara March 9, 2022, 10:17 a.m. UTC | #1
On Tue 08-03-22 08:33:17, Harshad Shirwadkar wrote:
> From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> 
> This patch reworks fast commit's commit path to remove locking the
> journal for the entire duration of a fast commit. Instead, we only lock
> the journal while marking all the eligible inodes as "committing". This
> allows handles to make progress in parallel with the fast commit.
> 
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>

The patch looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza


> ---
>  fs/ext4/fast_commit.c | 77 ++++++++++++++++++++++++++-----------------
>  fs/jbd2/journal.c     |  2 --
>  2 files changed, 47 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index be8c5b3456ec..eedcf8b4d47b 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -287,20 +287,30 @@ void ext4_fc_del(struct inode *inode)
>  	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
>  		return;
>  
> -restart:
>  	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
>  	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
>  		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
>  		return;
>  	}
>  
> -	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
> -		ext4_fc_wait_committing_inode(inode);
> -		goto restart;
> -	}
> -
> -	if (!list_empty(&ei->i_fc_list))
> -		list_del_init(&ei->i_fc_list);
> +	/*
> +	 * Since ext4_fc_del is called from ext4_evict_inode while having a
> +	 * handle open, there is no need for us to wait here even if a fast
> +	 * commit is going on. That is because, if this inode is being
> +	 * committed, ext4_mark_inode_dirty would have waited for inode commit
> +	 * operation to finish before we come here. So, by the time we come
> +	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
> +	 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
> +	 * here.
> +	 *
> +	 * We may come here without any handles open in the "no_delete" case of
> +	 * ext4_evict_inode as well. However, if that happens, we first mark the
> +	 * file system as fast commit ineligible anyway. So, even in that case,
> +	 * it is okay to remove the inode from the fc list.
> +	 */
> +	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
> +		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
> +	list_del_init(&ei->i_fc_list);
>  
>  	/*
>  	 * Since this inode is getting removed, let's also remove all FC
> @@ -323,8 +333,6 @@ void ext4_fc_del(struct inode *inode)
>  		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
>  		kfree(fc_dentry->fcd_name.name);
>  	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
> -
> -	return;
>  }
>  
>  /*
> @@ -964,19 +972,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
>  
>  	spin_lock(&sbi->s_fc_lock);
>  	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> -		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
> -		while (atomic_read(&ei->i_fc_updates)) {
> -			DEFINE_WAIT(wait);
> -
> -			prepare_to_wait(&ei->i_fc_wait, &wait,
> -						TASK_UNINTERRUPTIBLE);
> -			if (atomic_read(&ei->i_fc_updates)) {
> -				spin_unlock(&sbi->s_fc_lock);
> -				schedule();
> -				spin_lock(&sbi->s_fc_lock);
> -			}
> -			finish_wait(&ei->i_fc_wait, &wait);
> -		}
>  		spin_unlock(&sbi->s_fc_lock);
>  		ret = jbd2_submit_inode_data(ei->jinode);
>  		if (ret)
> @@ -998,13 +993,9 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal)
>  
>  	spin_lock(&sbi->s_fc_lock);
>  	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> -		spin_lock(&pos->i_fc_lock);
>  		if (!ext4_test_inode_state(&pos->vfs_inode,
> -					   EXT4_STATE_FC_COMMITTING)) {
> -			spin_unlock(&pos->i_fc_lock);
> +					   EXT4_STATE_FC_COMMITTING))
>  			continue;
> -		}
> -		spin_unlock(&pos->i_fc_lock);
>  		spin_unlock(&sbi->s_fc_lock);
>  
>  		ret = jbd2_wait_inode_data(journal, pos->jinode);
> @@ -1093,6 +1084,16 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	int ret = 0;
>  	u32 crc = 0;
>  
> +	/* Lock the journal */
> +	jbd2_journal_lock_updates(journal);
> +	spin_lock(&sbi->s_fc_lock);
> +	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
> +		ext4_set_inode_state(&iter->vfs_inode,
> +				     EXT4_STATE_FC_COMMITTING);
> +	}
> +	spin_unlock(&sbi->s_fc_lock);
> +	jbd2_journal_unlock_updates(journal);
> +
>  	ret = ext4_fc_submit_inode_data_all(journal);
>  	if (ret)
>  		return ret;
> @@ -1143,6 +1144,18 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  		ret = ext4_fc_write_inode(inode, &crc);
>  		if (ret)
>  			goto out;
> +		ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
> +		/*
> +		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
> +		 * visible before we send the wakeup. Pairs with implicit
> +		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
> +		 */
> +		smp_mb();
> +#if (BITS_PER_LONG < 64)
> +		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
> +#else
> +		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
> +#endif
>  		spin_lock(&sbi->s_fc_lock);
>  	}
>  	spin_unlock(&sbi->s_fc_lock);
> @@ -1276,13 +1289,17 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
>  	spin_lock(&sbi->s_fc_lock);
>  	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
>  				 i_fc_list) {
> -		list_del_init(&iter->i_fc_list);
>  		ext4_clear_inode_state(&iter->vfs_inode,
>  				       EXT4_STATE_FC_COMMITTING);
>  		if (iter->i_sync_tid <= tid)
>  			ext4_fc_reset_inode(&iter->vfs_inode);
> -		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
> +		/*
> +		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
> +		 * visible before we send the wakeup. Pairs with implicit
> +		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
> +		 */
>  		smp_mb();
> +		list_del_init(&iter->i_fc_list);
>  #if (BITS_PER_LONG < 64)
>  		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
>  #else
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index c2cf74b01ddb..06b885628b1c 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -757,7 +757,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
>  	}
>  	journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
>  	write_unlock(&journal->j_state_lock);
> -	jbd2_journal_lock_updates(journal);
>  
>  	return 0;
>  }
> @@ -769,7 +768,6 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
>   */
>  static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
>  {
> -	jbd2_journal_unlock_updates(journal);
>  	if (journal->j_fc_cleanup_callback)
>  		journal->j_fc_cleanup_callback(journal, 0, tid);
>  	write_lock(&journal->j_state_lock);
> -- 
> 2.35.1.616.g0bdcbb4464-goog
>
diff mbox series

Patch

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index be8c5b3456ec..eedcf8b4d47b 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -287,20 +287,30 @@  void ext4_fc_del(struct inode *inode)
 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 		return;
 
-restart:
 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 		return;
 	}
 
-	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-		ext4_fc_wait_committing_inode(inode);
-		goto restart;
-	}
-
-	if (!list_empty(&ei->i_fc_list))
-		list_del_init(&ei->i_fc_list);
+	/*
+	 * Since ext4_fc_del is called from ext4_evict_inode while holding an
+	 * open handle, there is no need for us to wait here even if a fast
+	 * commit is in progress. If this inode were being committed,
+	 * ext4_mark_inode_dirty would have waited for the inode's commit
+	 * operation to finish before we got here. So, by the time we get
+	 * here, the inode's EXT4_STATE_FC_COMMITTING flag has already been
+	 * cleared, and we should not see EXT4_STATE_FC_COMMITTING set on
+	 * this inode.
+	 *
+	 * We may also come here without any handles open in the "no_delete"
+	 * case of ext4_evict_inode. However, if that happens, we first mark
+	 * the file system as fast commit ineligible anyway. So, even in that
+	 * case, it is okay to remove the inode from the fc list.
+	 */
+	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+	list_del_init(&ei->i_fc_list);
 
 	/*
 	 * Since this inode is getting removed, let's also remove all FC
@@ -323,8 +333,6 @@  void ext4_fc_del(struct inode *inode)
 		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
 		kfree(fc_dentry->fcd_name.name);
 	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
-	return;
 }
 
 /*
@@ -964,19 +972,6 @@  static int ext4_fc_submit_inode_data_all(journal_t *journal)
 
 	spin_lock(&sbi->s_fc_lock);
 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
-		while (atomic_read(&ei->i_fc_updates)) {
-			DEFINE_WAIT(wait);
-
-			prepare_to_wait(&ei->i_fc_wait, &wait,
-						TASK_UNINTERRUPTIBLE);
-			if (atomic_read(&ei->i_fc_updates)) {
-				spin_unlock(&sbi->s_fc_lock);
-				schedule();
-				spin_lock(&sbi->s_fc_lock);
-			}
-			finish_wait(&ei->i_fc_wait, &wait);
-		}
 		spin_unlock(&sbi->s_fc_lock);
 		ret = jbd2_submit_inode_data(ei->jinode);
 		if (ret)
@@ -998,13 +993,9 @@  static int ext4_fc_wait_inode_data_all(journal_t *journal)
 
 	spin_lock(&sbi->s_fc_lock);
 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-		spin_lock(&pos->i_fc_lock);
 		if (!ext4_test_inode_state(&pos->vfs_inode,
-					   EXT4_STATE_FC_COMMITTING)) {
-			spin_unlock(&pos->i_fc_lock);
+					   EXT4_STATE_FC_COMMITTING))
 			continue;
-		}
-		spin_unlock(&pos->i_fc_lock);
 		spin_unlock(&sbi->s_fc_lock);
 
 		ret = jbd2_wait_inode_data(journal, pos->jinode);
@@ -1093,6 +1084,16 @@  static int ext4_fc_perform_commit(journal_t *journal)
 	int ret = 0;
 	u32 crc = 0;
 
+	/* Lock the journal */
+	jbd2_journal_lock_updates(journal);
+	spin_lock(&sbi->s_fc_lock);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+		ext4_set_inode_state(&iter->vfs_inode,
+				     EXT4_STATE_FC_COMMITTING);
+	}
+	spin_unlock(&sbi->s_fc_lock);
+	jbd2_journal_unlock_updates(journal);
+
 	ret = ext4_fc_submit_inode_data_all(journal);
 	if (ret)
 		return ret;
@@ -1143,6 +1144,18 @@  static int ext4_fc_perform_commit(journal_t *journal)
 		ret = ext4_fc_write_inode(inode, &crc);
 		if (ret)
 			goto out;
+		ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+		/*
+		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+		 * visible before we send the wakeup. Pairs with implicit
+		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
+		 */
+		smp_mb();
+#if (BITS_PER_LONG < 64)
+		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+#else
+		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+#endif
 		spin_lock(&sbi->s_fc_lock);
 	}
 	spin_unlock(&sbi->s_fc_lock);
@@ -1276,13 +1289,17 @@  static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 	spin_lock(&sbi->s_fc_lock);
 	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
 				 i_fc_list) {
-		list_del_init(&iter->i_fc_list);
 		ext4_clear_inode_state(&iter->vfs_inode,
 				       EXT4_STATE_FC_COMMITTING);
 		if (iter->i_sync_tid <= tid)
 			ext4_fc_reset_inode(&iter->vfs_inode);
-		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+		/*
+		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+		 * visible before we send the wakeup. Pairs with implicit
+		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
+		 */
 		smp_mb();
+		list_del_init(&iter->i_fc_list);
 #if (BITS_PER_LONG < 64)
 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
 #else
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c2cf74b01ddb..06b885628b1c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -757,7 +757,6 @@  int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
 	}
 	journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
 	write_unlock(&journal->j_state_lock);
-	jbd2_journal_lock_updates(journal);
 
 	return 0;
 }
@@ -769,7 +768,6 @@  EXPORT_SYMBOL(jbd2_fc_begin_commit);
  */
 static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
 {
-	jbd2_journal_unlock_updates(journal);
 	if (journal->j_fc_cleanup_callback)
 		journal->j_fc_cleanup_callback(journal, 0, tid);
 	write_lock(&journal->j_state_lock);