Message ID | 20220308163319.1183625-4-harshads@google.com |
---|---|
State | Superseded |
Headers | show |
Series | ext4: improve commit path performance for fast commit |
On Tue 08-03-22 08:33:17, Harshad Shirwadkar wrote: > From: Harshad Shirwadkar <harshadshirwadkar@gmail.com> > > This patch reworks fast commit's commit path to remove locking the > journal for the entire duration of a fast commit. Instead, we only lock > the journal while marking all the eligible inodes as "committing". This > allows handles to make progress in parallel with the fast commit. > > Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com> The patch looks good. Feel free to add: Reviewed-by: Jan Kara <jack@suse.cz> Honza > --- > fs/ext4/fast_commit.c | 77 ++++++++++++++++++++++++++----------------- > fs/jbd2/journal.c | 2 -- > 2 files changed, 47 insertions(+), 32 deletions(-) > > diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c > index be8c5b3456ec..eedcf8b4d47b 100644 > --- a/fs/ext4/fast_commit.c > +++ b/fs/ext4/fast_commit.c > @@ -287,20 +287,30 @@ void ext4_fc_del(struct inode *inode) > (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) > return; > > -restart: > spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); > if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { > spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); > return; > } > > - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { > - ext4_fc_wait_committing_inode(inode); > - goto restart; > - } > - > - if (!list_empty(&ei->i_fc_list)) > - list_del_init(&ei->i_fc_list); > + /* > + * Since ext4_fc_del is called from ext4_evict_inode while having a > + * handle open, there is no need for us to wait here even if a fast > + * commit is going on. That is because, if this inode is being > + * committed, ext4_mark_inode_dirty would have waited for inode commit > + * operation to finish before we come here. So, by the time we come > + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, > + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode > + * here. 
> + * > + * We may come here without any handles open in the "no_delete" case of > + * ext4_evict_inode as well. However, if that happens, we first mark the > + * file system as fast commit ineligible anyway. So, even in that case, > + * it is okay to remove the inode from the fc list. > + */ > + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) > + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); > + list_del_init(&ei->i_fc_list); > > /* > * Since this inode is getting removed, let's also remove all FC > @@ -323,8 +333,6 @@ void ext4_fc_del(struct inode *inode) > fc_dentry->fcd_name.len > DNAME_INLINE_LEN) > kfree(fc_dentry->fcd_name.name); > kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); > - > - return; > } > > /* > @@ -964,19 +972,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) > > spin_lock(&sbi->s_fc_lock); > list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { > - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); > - while (atomic_read(&ei->i_fc_updates)) { > - DEFINE_WAIT(wait); > - > - prepare_to_wait(&ei->i_fc_wait, &wait, > - TASK_UNINTERRUPTIBLE); > - if (atomic_read(&ei->i_fc_updates)) { > - spin_unlock(&sbi->s_fc_lock); > - schedule(); > - spin_lock(&sbi->s_fc_lock); > - } > - finish_wait(&ei->i_fc_wait, &wait); > - } > spin_unlock(&sbi->s_fc_lock); > ret = jbd2_submit_inode_data(ei->jinode); > if (ret) > @@ -998,13 +993,9 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal) > > spin_lock(&sbi->s_fc_lock); > list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { > - spin_lock(&pos->i_fc_lock); > if (!ext4_test_inode_state(&pos->vfs_inode, > - EXT4_STATE_FC_COMMITTING)) { > - spin_unlock(&pos->i_fc_lock); > + EXT4_STATE_FC_COMMITTING)) > continue; > - } > - spin_unlock(&pos->i_fc_lock); > spin_unlock(&sbi->s_fc_lock); > > ret = jbd2_wait_inode_data(journal, pos->jinode); > @@ -1093,6 +1084,16 @@ static int ext4_fc_perform_commit(journal_t *journal) > int ret 
= 0; > u32 crc = 0; > > + /* Lock the journal */ > + jbd2_journal_lock_updates(journal); > + spin_lock(&sbi->s_fc_lock); > + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { > + ext4_set_inode_state(&iter->vfs_inode, > + EXT4_STATE_FC_COMMITTING); > + } > + spin_unlock(&sbi->s_fc_lock); > + jbd2_journal_unlock_updates(journal); > + > ret = ext4_fc_submit_inode_data_all(journal); > if (ret) > return ret; > @@ -1143,6 +1144,18 @@ static int ext4_fc_perform_commit(journal_t *journal) > ret = ext4_fc_write_inode(inode, &crc); > if (ret) > goto out; > + ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); > + /* > + * Make sure clearing of EXT4_STATE_FC_COMMITTING is > + * visible before we send the wakeup. Pairs with implicit > + * barrier in prepare_to_wait() in ext4_fc_track_inode(). > + */ > + smp_mb(); > +#if (BITS_PER_LONG < 64) > + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); > +#else > + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); > +#endif > spin_lock(&sbi->s_fc_lock); > } > spin_unlock(&sbi->s_fc_lock); > @@ -1276,13 +1289,17 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) > spin_lock(&sbi->s_fc_lock); > list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], > i_fc_list) { > - list_del_init(&iter->i_fc_list); > ext4_clear_inode_state(&iter->vfs_inode, > EXT4_STATE_FC_COMMITTING); > if (iter->i_sync_tid <= tid) > ext4_fc_reset_inode(&iter->vfs_inode); > - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ > + /* > + * Make sure clearing of EXT4_STATE_FC_COMMITTING is > + * visible before we send the wakeup. Pairs with implicit > + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
> + */ > smp_mb(); > + list_del_init(&iter->i_fc_list); > #if (BITS_PER_LONG < 64) > wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); > #else > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c > index c2cf74b01ddb..06b885628b1c 100644 > --- a/fs/jbd2/journal.c > +++ b/fs/jbd2/journal.c > @@ -757,7 +757,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) > } > journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; > write_unlock(&journal->j_state_lock); > - jbd2_journal_lock_updates(journal); > > return 0; > } > @@ -769,7 +768,6 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); > */ > static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) > { > - jbd2_journal_unlock_updates(journal); > if (journal->j_fc_cleanup_callback) > journal->j_fc_cleanup_callback(journal, 0, tid); > write_lock(&journal->j_state_lock); > -- > 2.35.1.616.g0bdcbb4464-goog >
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index be8c5b3456ec..eedcf8b4d47b 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -287,20 +287,30 @@ void ext4_fc_del(struct inode *inode) (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) return; -restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. 
+ */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC @@ -323,8 +333,6 @@ void ext4_fc_del(struct inode *inode) fc_dentry->fcd_name.len > DNAME_INLINE_LEN) kfree(fc_dentry->fcd_name.name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -964,19 +972,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(ei->jinode); if (ret) @@ -998,13 +993,9 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal) spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - spin_lock(&pos->i_fc_lock); if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) { - spin_unlock(&pos->i_fc_lock); + EXT4_STATE_FC_COMMITTING)) continue; - } - spin_unlock(&pos->i_fc_lock); spin_unlock(&sbi->s_fc_lock); ret = jbd2_wait_inode_data(journal, pos->jinode); @@ -1093,6 +1084,16 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; + /* Lock the journal */ + jbd2_journal_lock_updates(journal); + spin_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + spin_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + ret = ext4_fc_submit_inode_data_all(journal); if (ret) return ret; 
@@ -1143,6 +1144,18 @@ static int ext4_fc_perform_commit(journal_t *journal) ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; + ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). + */ + smp_mb(); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); +#endif spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); @@ -1276,13 +1289,17 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); if (iter->i_sync_tid <= tid) ext4_fc_reset_inode(&iter->vfs_inode); - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
+ */ smp_mb(); + list_del_init(&iter->i_fc_list); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); #else diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c2cf74b01ddb..06b885628b1c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -757,7 +757,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); - jbd2_journal_lock_updates(journal); return 0; } @@ -769,7 +768,6 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); */ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { - jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock);