diff mbox series

jbd2: Avoid long replay times due to high number of revoke blocks

Message ID 20250116180223.18564-2-jack@suse.cz
State Superseded
Headers show
Series jbd2: Avoid long replay times due to high number of revoke blocks | expand

Commit Message

Jan Kara Jan. 16, 2025, 6:02 p.m. UTC
Some users are reporting journal replay takes a long time when there is
an excessive number of revoke blocks in the journal. Reported times are
like:

1048576 records - 95 seconds
2097152 records - 580 seconds

The problem is that hash chains in the revoke table get excessively
long in these cases. Fix the problem by sizing the revoke table
appropriately before the revoke pass.

Thanks to Alexey Zhuravlev <azhuravlev@ddn.com> for benchmarking the patch with
large numbers of revoke blocks [1].

[1] https://lore.kernel.org/all/20250113183107.7bfef7b6@x390.bzzz77.ru

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd2/recovery.c   | 54 +++++++++++++++++++++++++++++++++++++-------
 fs/jbd2/revoke.c     |  8 +++----
 include/linux/jbd2.h |  2 ++
 3 files changed, 52 insertions(+), 12 deletions(-)

Comments

Zhang Yi Jan. 17, 2025, 2:58 a.m. UTC | #1
On 2025/1/17 2:02, Jan Kara wrote:
> Some users are reporting journal replay takes a long time when there is
> an excessive number of revoke blocks in the journal. Reported times are
> like:
> 
> 1048576 records - 95 seconds
> 2097152 records - 580 seconds
> 
> The problem is that hash chains in the revoke table get excessively
> long in these cases. Fix the problem by sizing the revoke table
> appropriately before the revoke pass.
> 
> Thanks to Alexey Zhuravlev <azhuravlev@ddn.com> for benchmarking the patch with
> large numbers of revoke blocks [1].
> 
> [1] https://lore.kernel.org/all/20250113183107.7bfef7b6@x390.bzzz77.ru
> 
> Signed-off-by: Jan Kara <jack@suse.cz>

Hi, Jan,

Overall, this patch looks good to me; however, it appears not to be
based on the latest version of the upstream kernel, and I have one
minor suggestion below.

> ---
>  fs/jbd2/recovery.c   | 54 +++++++++++++++++++++++++++++++++++++-------
>  fs/jbd2/revoke.c     |  8 +++----
>  include/linux/jbd2.h |  2 ++
>  3 files changed, 52 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
> index 667f67342c52..9845f72e456a 100644
> --- a/fs/jbd2/recovery.c
> +++ b/fs/jbd2/recovery.c
> @@ -39,7 +39,7 @@ struct recovery_info
>  
>  static int do_one_pass(journal_t *journal,
>  				struct recovery_info *info, enum passtype pass);
> -static int scan_revoke_records(journal_t *, struct buffer_head *,
> +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *,
>  				tid_t, struct recovery_info *);
>  
>  #ifdef __KERNEL__
> @@ -327,6 +327,12 @@ int jbd2_journal_recover(journal_t *journal)
>  		  journal->j_transaction_sequence, journal->j_head);
>  
>  	jbd2_journal_clear_revoke(journal);
> +	/* Free revoke table allocated for replay */
> +	if (journal->j_revoke != journal->j_revoke_table[0] &&
> +	    journal->j_revoke != journal->j_revoke_table[1]) {
> +		jbd2_journal_destroy_revoke_table(journal->j_revoke);
> +		journal->j_revoke = journal->j_revoke_table[1];
> +	}
>  	err2 = sync_blockdev(journal->j_fs_dev);
>  	if (!err)
>  		err = err2;
> @@ -517,6 +523,31 @@ static int do_one_pass(journal_t *journal,
>  	first_commit_ID = next_commit_ID;
>  	if (pass == PASS_SCAN)
>  		info->start_transaction = first_commit_ID;
> +	else if (pass == PASS_REVOKE) {
> +		/*
> +		 * Would the default revoke table have too long hash chains
> +		 * during replay?
> +		 */
> +		if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) {
> +			unsigned int hash_size;
> +
> +			/*
> +			 * Aim for average chain length of 8, limit at 1M
> +			 * entries to avoid problems with malicious
> +			 * filesystems.
> +			 */
> +			hash_size = min(roundup_pow_of_two(info->nr_revokes / 8),
> +					1U << 20);
> +			journal->j_revoke =
> +				jbd2_journal_init_revoke_table(hash_size);
> +			if (!journal->j_revoke) {
> +				printk(KERN_ERR
> +				       "JBD2: failed to allocate revoke table for replay with %u entries. "
> +				       "Journal replay may be slow.\n", hash_size);
> +				journal->j_revoke = journal->j_revoke_table[1];
> +			}
> +		}
> +	}
>  
>  	jbd2_debug(1, "Starting recovery pass %d\n", pass);
>  
> @@ -874,14 +905,16 @@ static int do_one_pass(journal_t *journal,
>  				need_check_commit_time = true;
>  			}
>  
> -			/* If we aren't in the REVOKE pass, then we can
> -			 * just skip over this block. */
> -			if (pass != PASS_REVOKE) {
> +			/*
> +			 * If we aren't in the SCAN or REVOKE pass, then we can
> +			 * just skip over this block.
> +			 */
> +			if (pass != PASS_REVOKE && pass != PASS_SCAN) {
>  				brelse(bh);
>  				continue;
>  			}

How about moving this code snippet to the beginning of the
JBD2_REVOKE_BLOCK branch case?

Thanks,
Yi.


>  
>  extern void	   jbd2_journal_destroy_revoke(journal_t *);
>  extern int	   jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
Jan Kara Jan. 17, 2025, 11:11 a.m. UTC | #2
On Fri 17-01-25 10:58:42, Zhang Yi wrote:
> On 2025/1/17 2:02, Jan Kara wrote:
> > Some users are reporting journal replay takes a long time when there is
> > an excessive number of revoke blocks in the journal. Reported times are
> > like:
> > 
> > 1048576 records - 95 seconds
> > 2097152 records - 580 seconds
> > 
> > The problem is that hash chains in the revoke table get excessively
> > long in these cases. Fix the problem by sizing the revoke table
> > appropriately before the revoke pass.
> > 
> > Thanks to Alexey Zhuravlev <azhuravlev@ddn.com> for benchmarking the patch with
> > large numbers of revoke blocks [1].
> > 
> > [1] https://lore.kernel.org/all/20250113183107.7bfef7b6@x390.bzzz77.ru
> > 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> 
> Hi, Jan,
> 
> Overall, this patch looks good to me; however, it appears not to be
> based on the latest version of the upstream kernel, and I have one
> minor suggestion below.

OK, will do.

> > @@ -874,14 +905,16 @@ static int do_one_pass(journal_t *journal,
> >  				need_check_commit_time = true;
> >  			}
> >  
> > -			/* If we aren't in the REVOKE pass, then we can
> > -			 * just skip over this block. */
> > -			if (pass != PASS_REVOKE) {
> > +			/*
> > +			 * If we aren't in the SCAN or REVOKE pass, then we can
> > +			 * just skip over this block.
> > +			 */
> > +			if (pass != PASS_REVOKE && pass != PASS_SCAN) {
> >  				brelse(bh);
> >  				continue;
> >  			}
> 
> How about moving this code snippet to the beginning of the
> JBD2_REVOKE_BLOCK branch case?

I guess that is a good idea after the change. Thanks!

								Honza
diff mbox series

Patch

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 667f67342c52..9845f72e456a 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -39,7 +39,7 @@  struct recovery_info
 
 static int do_one_pass(journal_t *journal,
 				struct recovery_info *info, enum passtype pass);
-static int scan_revoke_records(journal_t *, struct buffer_head *,
+static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *,
 				tid_t, struct recovery_info *);
 
 #ifdef __KERNEL__
@@ -327,6 +327,12 @@  int jbd2_journal_recover(journal_t *journal)
 		  journal->j_transaction_sequence, journal->j_head);
 
 	jbd2_journal_clear_revoke(journal);
+	/* Free revoke table allocated for replay */
+	if (journal->j_revoke != journal->j_revoke_table[0] &&
+	    journal->j_revoke != journal->j_revoke_table[1]) {
+		jbd2_journal_destroy_revoke_table(journal->j_revoke);
+		journal->j_revoke = journal->j_revoke_table[1];
+	}
 	err2 = sync_blockdev(journal->j_fs_dev);
 	if (!err)
 		err = err2;
@@ -517,6 +523,31 @@  static int do_one_pass(journal_t *journal,
 	first_commit_ID = next_commit_ID;
 	if (pass == PASS_SCAN)
 		info->start_transaction = first_commit_ID;
+	else if (pass == PASS_REVOKE) {
+		/*
+		 * Would the default revoke table have too long hash chains
+		 * during replay?
+		 */
+		if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) {
+			unsigned int hash_size;
+
+			/*
+			 * Aim for average chain length of 8, limit at 1M
+			 * entries to avoid problems with malicious
+			 * filesystems.
+			 */
+			hash_size = min(roundup_pow_of_two(info->nr_revokes / 8),
+					1U << 20);
+			journal->j_revoke =
+				jbd2_journal_init_revoke_table(hash_size);
+			if (!journal->j_revoke) {
+				printk(KERN_ERR
+				       "JBD2: failed to allocate revoke table for replay with %u entries. "
+				       "Journal replay may be slow.\n", hash_size);
+				journal->j_revoke = journal->j_revoke_table[1];
+			}
+		}
+	}
 
 	jbd2_debug(1, "Starting recovery pass %d\n", pass);
 
@@ -874,14 +905,16 @@  static int do_one_pass(journal_t *journal,
 				need_check_commit_time = true;
 			}
 
-			/* If we aren't in the REVOKE pass, then we can
-			 * just skip over this block. */
-			if (pass != PASS_REVOKE) {
+			/*
+			 * If we aren't in the SCAN or REVOKE pass, then we can
+			 * just skip over this block.
+			 */
+			if (pass != PASS_REVOKE && pass != PASS_SCAN) {
 				brelse(bh);
 				continue;
 			}
 
-			err = scan_revoke_records(journal, bh,
+			err = scan_revoke_records(journal, pass, bh,
 						  next_commit_ID, info);
 			brelse(bh);
 			if (err)
@@ -937,8 +970,9 @@  static int do_one_pass(journal_t *journal,
 
 /* Scan a revoke record, marking all blocks mentioned as revoked. */
 
-static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
-			       tid_t sequence, struct recovery_info *info)
+static int scan_revoke_records(journal_t *journal, enum passtype pass,
+			       struct buffer_head *bh, tid_t sequence,
+			       struct recovery_info *info)
 {
 	jbd2_journal_revoke_header_t *header;
 	int offset, max;
@@ -959,6 +993,11 @@  static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 	if (jbd2_has_feature_64bit(journal))
 		record_len = 8;
 
+	if (pass == PASS_SCAN) {
+		info->nr_revokes += (max - offset) / record_len;
+		return 0;
+	}
+
 	while (offset + record_len <= max) {
 		unsigned long long blocknr;
 		int err;
@@ -971,7 +1010,6 @@  static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 		err = jbd2_journal_set_revoke(journal, blocknr, sequence);
 		if (err)
 			return err;
-		++info->nr_revokes;
 	}
 	return 0;
 }
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 4556e4689024..f4ac308e84c5 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -215,7 +215,7 @@  int __init jbd2_journal_init_revoke_table_cache(void)
 	return 0;
 }
 
-static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
+struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
 {
 	int shift = 0;
 	int tmp = hash_size;
@@ -231,7 +231,7 @@  static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
 	table->hash_size = hash_size;
 	table->hash_shift = shift;
 	table->hash_table =
-		kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
+		kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
 	if (!table->hash_table) {
 		kmem_cache_free(jbd2_revoke_table_cache, table);
 		table = NULL;
@@ -245,7 +245,7 @@  static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
 	return table;
 }
 
-static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
+void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
 {
 	int i;
 	struct list_head *hash_list;
@@ -255,7 +255,7 @@  static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
 		J_ASSERT(list_empty(hash_list));
 	}
 
-	kfree(table->hash_table);
+	kvfree(table->hash_table);
 	kmem_cache_free(jbd2_revoke_table_cache, table);
 }
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 8aef9bb6ad57..781615214d47 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1634,6 +1634,8 @@  extern void	   jbd2_journal_destroy_revoke_record_cache(void);
 extern void	   jbd2_journal_destroy_revoke_table_cache(void);
 extern int __init jbd2_journal_init_revoke_record_cache(void);
 extern int __init jbd2_journal_init_revoke_table_cache(void);
+struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size);
+void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table);
 
 extern void	   jbd2_journal_destroy_revoke(journal_t *);
 extern int	   jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);