diff mbox series

[RFC] improve malloc for large filesystems

Message ID 8738E8FF-820F-48A5-9150-7FF64219ED42@whamcloud.com
State New

Commit Message

Alex Zhuravlev Nov. 20, 2019, 10:35 a.m. UTC
Hi,

We’ve seen a few reports where a huge fragmented filesystem spends a lot of time looking for good chunks of free space.
Two issues have been identified so far:
1) mballoc tries too hard to find the best chunk, which is counterproductive - it makes sense to limit this process
2) during scanning, the bitmaps are loaded one by one, synchronously - it makes sense to prefetch a few groups at once
Here is a patch for comments; it is not really tested at scale, but it’d be great to get your feedback.

Thanks in advance, Alex

Comments

Artem Blagodarenko Nov. 20, 2019, 11:56 a.m. UTC | #1
Hello Alex,

Thank you for raising this important topic. I am going to test your patch and give feedback, but that requires some time.

For now I want to share my thoughts on this topic.
Here are slides from LAD2019, "Ldiskfs block allocator and aged file system": https://www.eofs.eu/_media/events/lad19/14_artem_blagodarenko-ldiskfs_block_allocator.pdf
There is also a patch, https://lists.openwall.net/linux-ext4/2019/03/11/5, that was sent here, but it requires further improvement.

Best regards,
Artem Blagodarenko

> On 20 Nov 2019, at 13:35, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
> 
> Hi,
> 
> We’ve seen few reports where a huge fragmented filesystem spends a lot of time looking for a good chunks of free space.
> Two issues have been identified so far:
> 1) mballoc tries too hard to find the best chunk which is counterproductive - it makes sense to limit this process
> 2) during scanning the bitmaps are loaded one by one, synchronously  - it makes sense to prefetch few groups at once
> Here is a patch for comments, not really tested at scale, but it’d be great to see the comments.
> 
> Thanks in advance, Alex
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 0b202e00d93f..76547601384b 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -404,7 +404,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>  * Return buffer_head on success or NULL in case of failure.
>  */
> struct buffer_head *
> -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
> +				 int ignore_locked)
> {
> 	struct ext4_group_desc *desc;
> 	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -435,6 +436,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> 	if (bitmap_uptodate(bh))
> 		goto verify;
> 
> +	if (ignore_locked && buffer_locked(bh)) {
> +		/* buffer under IO already, do not wait
> +		 * if called for prefetching */
> +		err = 0;
> +		goto out;
> +	}
> +
> 	lock_buffer(bh);
> 	if (bitmap_uptodate(bh)) {
> 		unlock_buffer(bh);
> @@ -524,7 +532,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
> 	struct buffer_head *bh;
> 	int err;
> 
> -	bh = ext4_read_block_bitmap_nowait(sb, block_group);
> +	bh = ext4_read_block_bitmap_nowait(sb, block_group, 1);
> 	if (IS_ERR(bh))
> 		return bh;
> 	err = ext4_wait_block_bitmap(sb, block_group, bh);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 03db3e71676c..2320d7e2f8d6 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1480,6 +1480,9 @@ struct ext4_sb_info {
> 	/* where last allocation was done - for stream allocation */
> 	unsigned long s_mb_last_group;
> 	unsigned long s_mb_last_start;
> +	unsigned int s_mb_toscan0;
> +	unsigned int s_mb_toscan1;
> +	unsigned int s_mb_prefetch;
> 
> 	/* stats for buddy allocator */
> 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
> @@ -2333,7 +2336,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
> extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
> 
> extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
> -						ext4_group_t block_group);
> +						ext4_group_t block_group,
> +						int ignore_locked);
> extern int ext4_wait_block_bitmap(struct super_block *sb,
> 				  ext4_group_t block_group,
> 				  struct buffer_head *bh);
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index a3e2767bdf2f..eac4ee225527 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
> 			bh[i] = NULL;
> 			continue;
> 		}
> -		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
> +		bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0);
> 		if (IS_ERR(bh[i])) {
> 			err = PTR_ERR(bh[i]);
> 			bh[i] = NULL;
> @@ -2095,10 +2095,52 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
> 	return 0;
> }
> 
> +/*
> + * each allocation context (i.e. a thread doing allocation) has own
> + * sliding prefetch window of @s_mb_prefetch size which starts at the
> + * very first goal and moves ahead of scaning.
> + * a side effect is that subsequent allocations will likely find
> + * the bitmaps in cache or at least in-flight.
> + */
> +static void
> +ext4_mb_prefetch(struct ext4_allocation_context *ac,
> +		    ext4_group_t start)
> +{
> +	ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
> +	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> +	struct ext4_group_info *grp;
> +	ext4_group_t group = start;
> +	struct buffer_head *bh;
> +	int nr;
> +
> +	/* batch prefetching to get few READs in flight */
> +	if (group + (sbi->s_mb_prefetch >> 1) < ac->ac_prefetch)
> +		return;
> +
> +	nr = sbi->s_mb_prefetch;
> +	while (nr > 0) {
> +		if (++group >= ngroups)
> +			group = 0;
> +		if (unlikely(group == start))
> +			break;
> +		grp = ext4_get_group_info(ac->ac_sb, group);
> +		/* ignore empty groups - those will be skipped
> +		 * during the scanning as well */
> +		if (grp->bb_free == 0)
> +			continue;
> +		nr--;
> +		if (!EXT4_MB_GRP_NEED_INIT(grp))
> +			continue;
> +		bh = ext4_read_block_bitmap_nowait(ac->ac_sb, group, 1);
> +		brelse(bh);
> +	}
> +	ac->ac_prefetch = group;
> +}
> +
> static noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> -	ext4_group_t ngroups, group, i;
> +	ext4_group_t ngroups, toscan, group, i;
> 	int cr;
> 	int err = 0, first_err = 0;
> 	struct ext4_sb_info *sbi;
> @@ -2160,6 +2202,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 	 * cr == 0 try to get exact allocation,
> 	 * cr == 3  try to get anything
> 	 */
> +
> +	ac->ac_prefetch = ac->ac_g_ex.fe_group;
> +
> repeat:
> 	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
> 		ac->ac_criteria = cr;
> @@ -2169,7 +2214,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 		 */
> 		group = ac->ac_g_ex.fe_group;
> 
> -		for (i = 0; i < ngroups; group++, i++) {
> +		/* limit number of groups to scan at the first two rounds
> +		 * when we hope to find something really good */
> +		toscan = ngroups;
> +		if (cr == 0)
> +			toscan = sbi->s_mb_toscan0;
> +		else if (cr == 1)
> +			toscan = sbi->s_mb_toscan1;
> +
> +		for (i = 0; i < toscan; group++, i++) {
> 			int ret = 0;
> 			cond_resched();
> 			/*
> @@ -2179,6 +2232,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 			if (group >= ngroups)
> 				group = 0;
> 
> +			ext4_mb_prefetch(ac, group);
> +
> 			/* This now checks without needing the buddy page */
> 			ret = ext4_mb_good_group(ac, group, cr);
> 			if (ret <= 0) {
> @@ -2872,6 +2927,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
> 			bio_put(discard_bio);
> 		}
> 	}
> +	sbi->s_mb_toscan0 = 1024;
> +	sbi->s_mb_toscan1 = 4096;
> +	sbi->s_mb_prefetch = 32;
> 
> 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
> 		ext4_free_data_in_buddy(sb, entry);
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 88c98f17e3d9..9ba5c75e6490 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -175,6 +175,7 @@ struct ext4_allocation_context {
> 	struct page *ac_buddy_page;
> 	struct ext4_prealloc_space *ac_pa;
> 	struct ext4_locality_group *ac_lg;
> +	ext4_group_t ac_prefetch;
> };
> 
> #define AC_STATUS_CONTINUE	1
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index eb1efad0e20a..4476d828439b 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -198,6 +198,9 @@ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
> EXT4_ATTR(first_error_time, 0444, first_error_time);
> EXT4_ATTR(last_error_time, 0444, last_error_time);
> EXT4_ATTR(journal_task, 0444, journal_task);
> +EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
> +EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
> 
> static unsigned int old_bump_val = 128;
> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -228,6 +231,9 @@ static struct attribute *ext4_attrs[] = {
> 	ATTR_LIST(first_error_time),
> 	ATTR_LIST(last_error_time),
> 	ATTR_LIST(journal_task),
> +	ATTR_LIST(mb_toscan0),
> +	ATTR_LIST(mb_toscan1),
> +	ATTR_LIST(mb_prefetch),
> 	NULL,
> };
> ATTRIBUTE_GROUPS(ext4);
>
Theodore Ts'o Nov. 20, 2019, 6:13 p.m. UTC | #2
Hi Alex,

A couple of comments.  First, please separate this patch so that these
two separate pieces of functionality can be reviewed and tested
separately:

> 1) mballoc tries too hard to find the best chunk which is
>  counterproductive - it makes sense to limit this process

> 2) during scanning the bitmaps are loaded one by one, synchronously
>  - it makes sense to prefetch few groups at once

As far as the prefetch is concerned, please note that the bitmap is first
read into the buffer cache via read_block_bitmap_nowait(), but then it
needs to be copied into buddy bitmap pages where it is cached along
side the buddy bitmap.  (The copy in the buddy bitmap is a combination
of the on-disk block allocation bitmap plus any outstanding
preallocations.)  From that copy of block bitmap, we then generate the
buddy bitmap and as a side effect, initialize the statistics
(grp->bb_first_free, grp->bb_largest_free_order, grp->bb_counters[]).

It is these statistics that we need to be able to make allocation
decisions for a particular block group.  So perhaps we should drive
the readahead of the bitmaps from ext4_mb_init_group() /
ext4_mb_init_cache(), and make sure that we actually initialize the
ext4_group_info structure, and not just read the bitmap into buffer
cache and hope it gets used before memory pressure pushes it out of
the buddy cache.
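
For illustration, a minimal sketch (not the posted patch) of driving the prefetch through group initialization as suggested above; the helper name, the 32-group batch size and GFP_NOFS are assumptions of the sketch, ext4_mb_init_group() is static to fs/ext4/mballoc.c so a helper like this would have to live there, and error handling is trimmed:

static void ext4_mb_prefetch_init(struct super_block *sb,
                                  ext4_group_t start, unsigned int nr)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct buffer_head *bh[32];
        unsigned int i, n = min_t(unsigned int, nr, ARRAY_SIZE(bh));
        ext4_group_t group;

        /* phase 1: submit the bitmap reads without blocking on any of them */
        for (i = 0, group = start; i < n; i++, group = (group + 1) % ngroups)
                bh[i] = ext4_read_block_bitmap_nowait(sb, group, 1);

        /* phase 2: wait for the IO and generate the buddy bitmap plus the
         * per-group statistics (bb_first_free, bb_largest_free_order,
         * bb_counters[]) that the allocator actually consumes */
        for (i = 0, group = start; i < n; i++, group = (group + 1) % ngroups) {
                if (IS_ERR_OR_NULL(bh[i]))
                        continue;
                if (!ext4_wait_block_bitmap(sb, group, bh[i]) &&
                    EXT4_MB_GRP_NEED_INIT(ext4_get_group_info(sb, group)))
                        ext4_mb_init_group(sb, group, GFP_NOFS);
                brelse(bh[i]);
        }
}

The wait in the second phase could equally be handed off to a workqueue or an IO completion callback so the allocating thread never blocks.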

Andreas has suggested going even farther, and perhaps storing this
derived information from the allocation bitmaps someplace convenient
on disk.  This is an on-disk format change, so we would want to think
very carefully before going down that path.  Especially since if we're
going to go this far, perhaps we should consider using an on-disk
b-tree to store the allocation information, which could be more
efficient than using allocation bitmaps plus buddy bitmaps.

Cheers,

							- Ted
Alex Zhuravlev Nov. 20, 2019, 6:22 p.m. UTC | #3
Thanks for the feedback.

> On 20 Nov 2019, at 21:13, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> Hi Alex,
> 
> A couple of comments.  First, please separate this patch so that these
> two separate pieces of functionality can be reviewed and tested
> separately:

Sure, that makes sense.

> 
> As far the prefetch is concerned, please note that the bitmap is first
> read into the buffer cache via read_block_bitmap_nowait(), but then it
> needs to be copied into buddy bitmap pages where it is cached along
> side the buddy bitmap.  (The copy in the buddy bitmap is a combination
> of the on-disk block allocation bitmap plus any outstanding
> preallocations.)  From that copy of block bitmap, we then generate the
> buddy bitmap and as a side effect, initialize the statistics
> (grp->bb_first_free, grp->bb_largest_free_order, grp->bb_counters[]).

> It is these statistics that we need to be able to make allocation
> decisions for a particular block group.  So perhaps we should drive
> the readahead of the bitmaps from ext4_mb_init_group() /
> ext4_mb_init_cache(), and make sure that we actually initialize the
> ext4_group_info structure, and not just read the bitmap into buffer
> cache and hope it gets used before memory pressure pushes it out of
> the buddy cache.

Indeed, but the point is that the majority of the time is the IO itself, so having the bitmap
in the buffer cache should already be an improvement, right? Not that I’m against buddy
initialisation, but it would add extra complexity for not that much extra
performance, IMO.

Memory pressure is a good point, though. Do you think touching the bitmap
bh/page could be enough to prevent it from being dropped early?
I could introduce another IO completion routine to schedule the buddy init.
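
A rough sketch of that idea, purely illustrative: the work item, the completion routine name and bitmap_init_work_lookup() are hypothetical, the allocation/INIT_WORK of the work item at submit time is elided, and it ignores that ext4_mb_init_group() and ext4_end_bitmap_read() are static to different files today.

struct ext4_bitmap_init_work {                  /* hypothetical */
        struct work_struct work;
        struct super_block *sb;
        ext4_group_t group;
};

static void ext4_mb_init_group_work(struct work_struct *work)
{
        struct ext4_bitmap_init_work *w =
                container_of(work, struct ext4_bitmap_init_work, work);

        /* build the buddy bitmap + statistics while the bitmap is still hot */
        ext4_mb_init_group(w->sb, w->group, GFP_NOFS);
        kfree(w);
}

/* installed as bh->b_end_io when the prefetch read is submitted; the work
 * item would be allocated and INIT_WORK()ed at that point */
static void ext4_end_bitmap_read_init(struct buffer_head *bh, int uptodate)
{
        struct ext4_bitmap_init_work *w = bitmap_init_work_lookup(bh);

        ext4_end_bitmap_read(bh, uptodate);     /* existing completion path */
        if (uptodate && w)
                queue_work(system_unbound_wq, &w->work);
}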

> Andreas has suggested going even farther, and perhaps storing this
> derived information from the allocation bitmaps someplace convenient
> on disk.  This is an on-disk format change, so we would want to think
> very carefully before going down that path.  Especially since if we're
> going to go this far, perhaps we should consider using an on-disk
> b-tree to store the allocation information, which could be more
> efficient than using allocation bitmaps plus buddy bitmaps.

This is something I normally try to avoid, but in general I have no objection.

Thanks, Alex
Alex Zhuravlev Nov. 20, 2019, 6:33 p.m. UTC | #4
> On 20 Nov 2019, at 14:56, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
> Now I want to share my thougths about this topic.
> Here are slides from LAD2019 “Ldiskfs block allocator and aged file system" https://www.eofs.eu/_media/events/lad19/14_artem_blagodarenko-ldiskfs_block_allocator.pdf
> There is also patch https://lists.openwall.net/linux-ext4/2019/03/11/5 that was sent here, but it require to be improved.

Thanks, I’m going through this interesting material.

Thanks, Alex
Alex Zhuravlev Nov. 21, 2019, 7:03 a.m. UTC | #5
> On 20 Nov 2019, at 21:13, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> Hi Alex,
> 
> A couple of comments.  First, please separate this patch so that these
> two separate pieces of functionality can be reviewed and tested
> separately:
> 

This is the first patch of the series.

Thanks, Alex

From 81c4b3b5a17d94525bbc6d2d89b20f6618b05bc6 Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev <bzzz@whamcloud.com>
Date: Thu, 21 Nov 2019 09:53:13 +0300
Subject: [PATCH 1/2] ext4: limit scanning for a good group

in the first two rounds, to prevent a situation where tens to hundreds of thousands
of groups are scanned, especially non-initialized groups.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/mballoc.c | 14 ++++++++++++--
 fs/ext4/sysfs.c   |  4 ++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 03db3e71676c..d4e47fdad87c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1480,6 +1480,8 @@ struct ext4_sb_info {
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
+	unsigned int s_mb_toscan0;
+	unsigned int s_mb_toscan1;
 
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a3e2767bdf2f..cebd7d8df0b8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2098,7 +2098,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-	ext4_group_t ngroups, group, i;
+	ext4_group_t ngroups, toscan, group, i;
 	int cr;
 	int err = 0, first_err = 0;
 	struct ext4_sb_info *sbi;
@@ -2169,7 +2169,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		 */
 		group = ac->ac_g_ex.fe_group;
 
-		for (i = 0; i < ngroups; group++, i++) {
+		/* limit number of groups to scan at the first two rounds
+		 * when we hope to find something really good */
+		toscan = ngroups;
+		if (cr == 0)
+			toscan = sbi->s_mb_toscan0;
+		else if (cr == 1)
+			toscan = sbi->s_mb_toscan1;
+
+		for (i = 0; i < toscan; group++, i++) {
 			int ret = 0;
 			cond_resched();
 			/*
@@ -2872,6 +2880,8 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 			bio_put(discard_bio);
 		}
 	}
+	sbi->s_mb_toscan0 = 1024;
+	sbi->s_mb_toscan1 = 4096;
 
 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
 		ext4_free_data_in_buddy(sb, entry);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..c96ee20f5487 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -198,6 +198,8 @@ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
 EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
+EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -228,6 +230,8 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
 	ATTR_LIST(journal_task),
+	ATTR_LIST(mb_toscan0),
+	ATTR_LIST(mb_toscan1),
 	NULL,
 };
 ATTRIBUTE_GROUPS(ext4);
Alex Zhuravlev Nov. 21, 2019, 7:03 a.m. UTC | #6
> On 20 Nov 2019, at 21:13, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> Hi Alex,
> 
> A couple of comments.  First, please separate this patch so that these
> two separate pieces of functionality can be reviewed and tested
> separately:
> 

And this is the second one

Thanks, Alex

From d2ff76d76320ade5f53002aa522b6eccfa058d47 Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev <bzzz@whamcloud.com>
Date: Thu, 21 Nov 2019 10:00:07 +0300
Subject: [PATCH 2/2] ext4: prefetch bitmaps during block allocation

When the cache is cold, reading bitmaps one by one can slow down
the process significantly, especially on legacy rotating drives.
---
 fs/ext4/balloc.c  | 12 ++++++++++--
 fs/ext4/ext4.h    |  4 +++-
 fs/ext4/mballoc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/mballoc.h |  1 +
 fs/ext4/sysfs.c   |  2 ++
 5 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..76547601384b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -404,7 +404,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+				 int ignore_locked)
 {
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -435,6 +436,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	if (bitmap_uptodate(bh))
 		goto verify;
 
+	if (ignore_locked && buffer_locked(bh)) {
+		/* buffer under IO already, do not wait
+		 * if called for prefetching */
+		err = 0;
+		goto out;
+	}
+
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
@@ -524,7 +532,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh;
 	int err;
 
-	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	bh = ext4_read_block_bitmap_nowait(sb, block_group, 1);
 	if (IS_ERR(bh))
 		return bh;
 	err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d4e47fdad87c..2320d7e2f8d6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1482,6 +1482,7 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_start;
 	unsigned int s_mb_toscan0;
 	unsigned int s_mb_toscan1;
+	unsigned int s_mb_prefetch;
 
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -2335,7 +2336,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 
 extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-						ext4_group_t block_group);
+						ext4_group_t block_group,
+						int ignore_locked);
 extern int ext4_wait_block_bitmap(struct super_block *sb,
 				  ext4_group_t block_group,
 				  struct buffer_head *bh);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cebd7d8df0b8..eac4ee225527 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 			bh[i] = NULL;
 			continue;
 		}
-		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+		bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0);
 		if (IS_ERR(bh[i])) {
 			err = PTR_ERR(bh[i]);
 			bh[i] = NULL;
@@ -2095,6 +2095,48 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
+/*
+ * each allocation context (i.e. a thread doing allocation) has own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of scaning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void
+ext4_mb_prefetch(struct ext4_allocation_context *ac,
+		    ext4_group_t start)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_group_info *grp;
+	ext4_group_t group = start;
+	struct buffer_head *bh;
+	int nr;
+
+	/* batch prefetching to get few READs in flight */
+	if (group + (sbi->s_mb_prefetch >> 1) < ac->ac_prefetch)
+		return;
+
+	nr = sbi->s_mb_prefetch;
+	while (nr > 0) {
+		if (++group >= ngroups)
+			group = 0;
+		if (unlikely(group == start))
+			break;
+		grp = ext4_get_group_info(ac->ac_sb, group);
+		/* ignore empty groups - those will be skipped
+		 * during the scanning as well */
+		if (grp->bb_free == 0)
+			continue;
+		nr--;
+		if (!EXT4_MB_GRP_NEED_INIT(grp))
+			continue;
+		bh = ext4_read_block_bitmap_nowait(ac->ac_sb, group, 1);
+		brelse(bh);
+	}
+	ac->ac_prefetch = group;
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2160,6 +2202,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	 * cr == 0 try to get exact allocation,
 	 * cr == 3  try to get anything
 	 */
+
+	ac->ac_prefetch = ac->ac_g_ex.fe_group;
+
 repeat:
 	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
 		ac->ac_criteria = cr;
@@ -2187,6 +2232,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 			if (group >= ngroups)
 				group = 0;
 
+			ext4_mb_prefetch(ac, group);
+
 			/* This now checks without needing the buddy page */
 			ret = ext4_mb_good_group(ac, group, cr);
 			if (ret <= 0) {
@@ -2882,6 +2929,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 	}
 	sbi->s_mb_toscan0 = 1024;
 	sbi->s_mb_toscan1 = 4096;
+	sbi->s_mb_prefetch = 32;
 
 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
 		ext4_free_data_in_buddy(sb, entry);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..9ba5c75e6490 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -175,6 +175,7 @@ struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	ext4_group_t ac_prefetch;
 };
 
 #define AC_STATUS_CONTINUE	1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index c96ee20f5487..4476d828439b 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -200,6 +200,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time);
 EXT4_ATTR(journal_task, 0444, journal_task);
 EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
 EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -232,6 +233,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(journal_task),
 	ATTR_LIST(mb_toscan0),
 	ATTR_LIST(mb_toscan1),
+	ATTR_LIST(mb_prefetch),
 	NULL,
 };
 ATTRIBUTE_GROUPS(ext4);
Artem Blagodarenko Nov. 21, 2019, 8:30 a.m. UTC | #7
Hello Alex,

The code looks good, but I have objections to the approach.

A 512TB disk with 4k block size has 4194304 groups, so 4k groups is only ~0.1% (4096 / 4194304) of the whole disk.
Can we make a decision to break the search and settle for less optimal blocks based on such limited data?
I am not sure that spending some time to find a good group is worse than allocating blocks without
optimisation, especially if the disk is quite free and there are a lot of free block groups.

Best regards,
Artem Blagodarenko.
> On 21 Nov 2019, at 10:03, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
> 
> 
> 
>> On 20 Nov 2019, at 21:13, Theodore Y. Ts'o <tytso@mit.edu> wrote:
>> 
>> Hi Alex,
>> 
>> A couple of comments.  First, please separate this patch so that these
>> two separate pieces of functionality can be reviewed and tested
>> separately:
>> 
> 
> This is the first patch of the series.
> 
> Thanks, Alex
> 
> From 81c4b3b5a17d94525bbc6d2d89b20f6618b05bc6 Mon Sep 17 00:00:00 2001
> From: Alex Zhuravlev <bzzz@whamcloud.com>
> Date: Thu, 21 Nov 2019 09:53:13 +0300
> Subject: [PATCH 1/2] ext4: limit scanning for a good group
> 
> at first two rounds to prevent situation when 10x-100x thousand
> of groups are scanned, especially non-initialized groups.
> 
> Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
> ---
> fs/ext4/ext4.h    |  2 ++
> fs/ext4/mballoc.c | 14 ++++++++++++--
> fs/ext4/sysfs.c   |  4 ++++
> 3 files changed, 18 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 03db3e71676c..d4e47fdad87c 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1480,6 +1480,8 @@ struct ext4_sb_info {
> 	/* where last allocation was done - for stream allocation */
> 	unsigned long s_mb_last_group;
> 	unsigned long s_mb_last_start;
> +	unsigned int s_mb_toscan0;
> +	unsigned int s_mb_toscan1;
> 
> 	/* stats for buddy allocator */
> 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index a3e2767bdf2f..cebd7d8df0b8 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2098,7 +2098,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
> static noinline_for_stack int
> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> {
> -	ext4_group_t ngroups, group, i;
> +	ext4_group_t ngroups, toscan, group, i;
> 	int cr;
> 	int err = 0, first_err = 0;
> 	struct ext4_sb_info *sbi;
> @@ -2169,7 +2169,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> 		 */
> 		group = ac->ac_g_ex.fe_group;
> 
> -		for (i = 0; i < ngroups; group++, i++) {
> +		/* limit number of groups to scan at the first two rounds
> +		 * when we hope to find something really good */
> +		toscan = ngroups;
> +		if (cr == 0)
> +			toscan = sbi->s_mb_toscan0;
> +		else if (cr == 1)
> +			toscan = sbi->s_mb_toscan1;
> +
> +		for (i = 0; i < toscan; group++, i++) {
> 			int ret = 0;
> 			cond_resched();
> 			/*
> @@ -2872,6 +2880,8 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
> 			bio_put(discard_bio);
> 		}
> 	}
> +	sbi->s_mb_toscan0 = 1024;
> +	sbi->s_mb_toscan1 = 4096;
> 
> 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
> 		ext4_free_data_in_buddy(sb, entry);
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index eb1efad0e20a..c96ee20f5487 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -198,6 +198,8 @@ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
> EXT4_ATTR(first_error_time, 0444, first_error_time);
> EXT4_ATTR(last_error_time, 0444, last_error_time);
> EXT4_ATTR(journal_task, 0444, journal_task);
> +EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
> +EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
> 
> static unsigned int old_bump_val = 128;
> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -228,6 +230,8 @@ static struct attribute *ext4_attrs[] = {
> 	ATTR_LIST(first_error_time),
> 	ATTR_LIST(last_error_time),
> 	ATTR_LIST(journal_task),
> +	ATTR_LIST(mb_toscan0),
> +	ATTR_LIST(mb_toscan1),
> 	NULL,
> };
> ATTRIBUTE_GROUPS(ext4);
> -- 
> 2.20.1
> 
>
Alex Zhuravlev Nov. 21, 2019, 8:52 a.m. UTC | #8
> On 21 Nov 2019, at 11:30, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
> 
> Hello Alex,
> 
> Code looks good, but I have objections about the approach.
> 
> 512TB disk with 4k block size have 4194304 groups. So 4k groups is only ~0.01% of whole disk.
> Can we make decision to break search and get minimum blocks based on such limited data.
> I am not sure that spending some time to find good group is worse then allocate blocks without 
> optimisation. Especially, if disk is quite free and there are a lot of free block groups.

The exact numbers aren’t hardcoded and are subject to discussion, but you don’t really want to scan 4M
groups (especially uninitialised ones) to find the “best” chunk.

This can be optimized further, e.g. “don’t count initialized and/or empty groups”, but still some limit
is required, IMO. Notice this limit doesn’t apply once we have tried to find the “best”, i.e. it’s applied only
with cr=0 and cr=1.


Thanks, Alex

> 
> Best regards,
> Artem Blagodarenko.
>> On 21 Nov 2019, at 10:03, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
>> 
>> 
>> 
>>> On 20 Nov 2019, at 21:13, Theodore Y. Ts'o <tytso@mit.edu> wrote:
>>> 
>>> Hi Alex,
>>> 
>>> A couple of comments.  First, please separate this patch so that these
>>> two separate pieces of functionality can be reviewed and tested
>>> separately:
>>> 
>> 
>> This is the first patch of the series.
>> 
>> Thanks, Alex
>> 
>> From 81c4b3b5a17d94525bbc6d2d89b20f6618b05bc6 Mon Sep 17 00:00:00 2001
>> From: Alex Zhuravlev <bzzz@whamcloud.com>
>> Date: Thu, 21 Nov 2019 09:53:13 +0300
>> Subject: [PATCH 1/2] ext4: limit scanning for a good group
>> 
>> at first two rounds to prevent situation when 10x-100x thousand
>> of groups are scanned, especially non-initialized groups.
>> 
>> Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
>> ---
>> fs/ext4/ext4.h    |  2 ++
>> fs/ext4/mballoc.c | 14 ++++++++++++--
>> fs/ext4/sysfs.c   |  4 ++++
>> 3 files changed, 18 insertions(+), 2 deletions(-)
>> 
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 03db3e71676c..d4e47fdad87c 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1480,6 +1480,8 @@ struct ext4_sb_info {
>> 	/* where last allocation was done - for stream allocation */
>> 	unsigned long s_mb_last_group;
>> 	unsigned long s_mb_last_start;
>> +	unsigned int s_mb_toscan0;
>> +	unsigned int s_mb_toscan1;
>> 
>> 	/* stats for buddy allocator */
>> 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index a3e2767bdf2f..cebd7d8df0b8 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -2098,7 +2098,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
>> static noinline_for_stack int
>> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> {
>> -	ext4_group_t ngroups, group, i;
>> +	ext4_group_t ngroups, toscan, group, i;
>> 	int cr;
>> 	int err = 0, first_err = 0;
>> 	struct ext4_sb_info *sbi;
>> @@ -2169,7 +2169,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>> 		 */
>> 		group = ac->ac_g_ex.fe_group;
>> 
>> -		for (i = 0; i < ngroups; group++, i++) {
>> +		/* limit number of groups to scan at the first two rounds
>> +		 * when we hope to find something really good */
>> +		toscan = ngroups;
>> +		if (cr == 0)
>> +			toscan = sbi->s_mb_toscan0;
>> +		else if (cr == 1)
>> +			toscan = sbi->s_mb_toscan1;
>> +
>> +		for (i = 0; i < toscan; group++, i++) {
>> 			int ret = 0;
>> 			cond_resched();
>> 			/*
>> @@ -2872,6 +2880,8 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
>> 			bio_put(discard_bio);
>> 		}
>> 	}
>> +	sbi->s_mb_toscan0 = 1024;
>> +	sbi->s_mb_toscan1 = 4096;
>> 
>> 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
>> 		ext4_free_data_in_buddy(sb, entry);
>> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
>> index eb1efad0e20a..c96ee20f5487 100644
>> --- a/fs/ext4/sysfs.c
>> +++ b/fs/ext4/sysfs.c
>> @@ -198,6 +198,8 @@ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
>> EXT4_ATTR(first_error_time, 0444, first_error_time);
>> EXT4_ATTR(last_error_time, 0444, last_error_time);
>> EXT4_ATTR(journal_task, 0444, journal_task);
>> +EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
>> +EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
>> 
>> static unsigned int old_bump_val = 128;
>> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
>> @@ -228,6 +230,8 @@ static struct attribute *ext4_attrs[] = {
>> 	ATTR_LIST(first_error_time),
>> 	ATTR_LIST(last_error_time),
>> 	ATTR_LIST(journal_task),
>> +	ATTR_LIST(mb_toscan0),
>> +	ATTR_LIST(mb_toscan1),
>> 	NULL,
>> };
>> ATTRIBUTE_GROUPS(ext4);
>> -- 
>> 2.20.1
>> 
>> 
>
Artem Blagodarenko Nov. 21, 2019, 9:18 a.m. UTC | #9
> On 21 Nov 2019, at 11:52, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
> 
> 
> 
>> On 21 Nov 2019, at 11:30, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
>> 
>> Hello Alex,
>> 
>> Code looks good, but I have objections about the approach.
>> 
>> 512TB disk with 4k block size have 4194304 groups. So 4k groups is only ~0.01% of whole disk.
>> Can we make decision to break search and get minimum blocks based on such limited data.
>> I am not sure that spending some time to find good group is worse then allocate blocks without 
>> optimisation. Especially, if disk is quite free and there are a lot of free block groups.
> 
> Exact number isn’t hardcoded and subject to discussion, but you don’t really want to scan 4M 
> groups (especially uninitialised) to find “best” chunk.
> 

Then it looks reasonable to make it disabled by default (set to 0, meaning “do not limit”) and enable it by writing a number to the attribute.

> This can be optimized further like “don’t count initialized and/or empty groups”, but still some limit
> Is required, IMO. Notice this limit doesn’t apply if once we tried to find “best”, i.e. it’s applied only
> with cr=0 and cr=1.

cr=0 and cr=1 are also important. They allow the allocator to be accelerated by growing the allocation window
step by step, from the smallest to the biggest value in the preallocation table.
Your optimisation can reset this allocation window growth.

Assume we have one fragmented part of the disk and all other parts are quite free.
The allocator will spend a lot of time going through this fragmented part, because it will bail out of cr0 and cr1
(due to the scan limit) and get a range that only satisfies cr3.

The cr3 requirement is quite simple: “get the first group that has enough free blocks to allocate the requested range”.
With high probability the allocator finds such a group near the start of the cr3 loop, so the goal (the allocator starts its search from the goal) will not change significantly.
Thus the allocator walks through this fragmented range in small steps.

Without the suggested optimisation, the allocator skips this fragmented range right away and continues to allocate blocks.

Best regards,
Artem Blagodarenko.

> 
> Thanks, Alex
> 
>> 
>> Best regards,
>> Artem Blagodarenko.
>>> On 21 Nov 2019, at 10:03, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
>>> 
>>> 
>>> 
>>>> On 20 Nov 2019, at 21:13, Theodore Y. Ts'o <tytso@mit.edu> wrote:
>>>> 
>>>> Hi Alex,
>>>> 
>>>> A couple of comments.  First, please separate this patch so that these
>>>> two separate pieces of functionality can be reviewed and tested
>>>> separately:
>>>> 
>>> 
>>> This is the first patch of the series.
>>> 
>>> Thanks, Alex
>>> 
>>> From 81c4b3b5a17d94525bbc6d2d89b20f6618b05bc6 Mon Sep 17 00:00:00 2001
>>> From: Alex Zhuravlev <bzzz@whamcloud.com>
>>> Date: Thu, 21 Nov 2019 09:53:13 +0300
>>> Subject: [PATCH 1/2] ext4: limit scanning for a good group
>>> 
>>> at first two rounds to prevent situation when 10x-100x thousand
>>> of groups are scanned, especially non-initialized groups.
>>> 
>>> Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
>>> ---
>>> fs/ext4/ext4.h    |  2 ++
>>> fs/ext4/mballoc.c | 14 ++++++++++++--
>>> fs/ext4/sysfs.c   |  4 ++++
>>> 3 files changed, 18 insertions(+), 2 deletions(-)
>>> 
>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>> index 03db3e71676c..d4e47fdad87c 100644
>>> --- a/fs/ext4/ext4.h
>>> +++ b/fs/ext4/ext4.h
>>> @@ -1480,6 +1480,8 @@ struct ext4_sb_info {
>>> 	/* where last allocation was done - for stream allocation */
>>> 	unsigned long s_mb_last_group;
>>> 	unsigned long s_mb_last_start;
>>> +	unsigned int s_mb_toscan0;
>>> +	unsigned int s_mb_toscan1;
>>> 
>>> 	/* stats for buddy allocator */
>>> 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>> index a3e2767bdf2f..cebd7d8df0b8 100644
>>> --- a/fs/ext4/mballoc.c
>>> +++ b/fs/ext4/mballoc.c
>>> @@ -2098,7 +2098,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
>>> static noinline_for_stack int
>>> ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>>> {
>>> -	ext4_group_t ngroups, group, i;
>>> +	ext4_group_t ngroups, toscan, group, i;
>>> 	int cr;
>>> 	int err = 0, first_err = 0;
>>> 	struct ext4_sb_info *sbi;
>>> @@ -2169,7 +2169,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>>> 		 */
>>> 		group = ac->ac_g_ex.fe_group;
>>> 
>>> -		for (i = 0; i < ngroups; group++, i++) {
>>> +		/* limit number of groups to scan at the first two rounds
>>> +		 * when we hope to find something really good */
>>> +		toscan = ngroups;
>>> +		if (cr == 0)
>>> +			toscan = sbi->s_mb_toscan0;
>>> +		else if (cr == 1)
>>> +			toscan = sbi->s_mb_toscan1;
>>> +
>>> +		for (i = 0; i < toscan; group++, i++) {
>>> 			int ret = 0;
>>> 			cond_resched();
>>> 			/*
>>> @@ -2872,6 +2880,8 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
>>> 			bio_put(discard_bio);
>>> 		}
>>> 	}
>>> +	sbi->s_mb_toscan0 = 1024;
>>> +	sbi->s_mb_toscan1 = 4096;
>>> 
>>> 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
>>> 		ext4_free_data_in_buddy(sb, entry);
>>> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
>>> index eb1efad0e20a..c96ee20f5487 100644
>>> --- a/fs/ext4/sysfs.c
>>> +++ b/fs/ext4/sysfs.c
>>> @@ -198,6 +198,8 @@ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
>>> EXT4_ATTR(first_error_time, 0444, first_error_time);
>>> EXT4_ATTR(last_error_time, 0444, last_error_time);
>>> EXT4_ATTR(journal_task, 0444, journal_task);
>>> +EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
>>> +EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
>>> 
>>> static unsigned int old_bump_val = 128;
>>> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
>>> @@ -228,6 +230,8 @@ static struct attribute *ext4_attrs[] = {
>>> 	ATTR_LIST(first_error_time),
>>> 	ATTR_LIST(last_error_time),
>>> 	ATTR_LIST(journal_task),
>>> +	ATTR_LIST(mb_toscan0),
>>> +	ATTR_LIST(mb_toscan1),
>>> 	NULL,
>>> };
>>> ATTRIBUTE_GROUPS(ext4);
>>> -- 
>>> 2.20.1
>>> 
>>> 
>> 
>
Alex Zhuravlev Nov. 21, 2019, 2:41 p.m. UTC | #10
> On 21 Nov 2019, at 12:18, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
> 
> cr=0 and cr=1 are also important. They allow to accelerate allocator, by increasing allocation window.
> Step by step from smallest to the biggest value in preallocation table.
> Your optimisation can reset this allocation window grow.

Does it?
If the allocated chunk is smaller than the goal one (sized to the predicted file size), then the next allocation will try to allocate the remainder, AFAIR.
But the prediction doesn’t depend on the previous allocation; essentially it just aligns the current file size + allocation to the next 2^n
(or to a value from the table in other versions).
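
As a toy illustration of that rounding (a simplified model with a hypothetical helper name, not the actual ext4_mb_normalize_request() logic, which uses a ladder of size thresholds and several caps):

/* toy model: the goal length in blocks is derived from the file size so far
 * plus the new request, rounded up to the next power of two - independent of
 * how much the previous allocation actually got */
static unsigned int mb_goal_blocks(loff_t isize, unsigned int req_blocks,
                                   unsigned int blkbits)
{
        loff_t bytes = isize + ((loff_t)req_blocks << blkbits);
        unsigned int goal = 1;

        while (((loff_t)goal << blkbits) < bytes)
                goal <<= 1;

        /* e.g. a 5MB file appending 1MB asks for an 8MB (2048-block) goal */
        return goal;
}

The point being that the goal is recomputed from the file size on every allocation, so an earlier short allocation does not by itself shrink the next window.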

cr=0 is an optimisation on its own, as it checks only 2^n chunks, which is cheap in terms of cycles.
cr=1 is more expensive: we skip groups likely having no good extents (using the average free extent size), but we still search for the goal size.

> Assume we have one fragmented part of disk and all other parts are quite free.
> Allocator will spend a lot of time to go through this fragmented part, because  will brake cr0 and cr1 and get
> range that satisfy c3. 

Even at cr=3 we still search for the goal size.

Thus we shouldn’t really end up allocating bad chunks just because we break out of cr=0 and cr=1; we merely stop looking for nice-looking groups
and fall back to the regular (more expensive) search for free extents.

> 
> c3 requirement is quite simple “get first group that have enough free blocks to allocate requested range”.

This is only group selection; we then try to find that extent within the group, which can fail, in which case we move to the next group.
EXT4_MB_HINT_FIRST is set outside of the main cr=0..3 loop.

> With hight probability allocator find such group at the start of c3 loop, so goal (allocator starts its searching from goal) will not significantly changed.
> Thus allocator go through this fragmented range using small steps.
> 
> Without suggested optimisation, allocator skips this fragmented range at moment and continue to allocate blocks.

1000 groups * 5ms avg.time = 5 seconds to skip 1000 bad uninitialized groups. This is the real problem.
You mentioned 4M groups...

Thanks, Alex
Andreas Dilger Nov. 25, 2019, 9:39 p.m. UTC | #11
On Nov 21, 2019, at 7:41 AM, Alex Zhuravlev <azhuravlev@whamcloud.com> wrote:
> 
> On 21 Nov 2019, at 12:18, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
>> Assume we have one fragmented part of disk and all other parts are quite free.
>> Allocator will spend a lot of time to go through this fragmented part, because
>> will brake cr0 and cr1 and get range that satisfy c3.
> 
> Even at cr=3 we still search for the goal size.
> 
> Thus we shouldn’t really allocate bad chunks because we break cr=0 and cr=1,
> we just stop to look for nicely looking groups and fallback to regular (more
> expensive) search for free extents.

I think it is important to understand what the actual goal size is at this
point.  The filesystems where we are seeing problems are _huge_ (650TiB and
larger) and are relatively full (70% or more) but take tens of minutes to
finish mounting.  Lustre does some small writes at mount time, but it shouldn't
take so long to find some small allocations for the config log update.

The filesystems are automatically getting "s_stripe_size = 512" from mke2fs
(presumably from the underlying RAID), and I _think_ this is causing mballoc
to inflate the IO request to 8-16MB prealloc chunks, which would be much
harder to find, and unnecessary for a small allocation.

>> c3 requirement is quite simple “get first group that have enough free
>> blocks to allocate requested range”.
> 
> This is only group selection, then we try to find that extent within that
> group, can fail and move to the next group.
> EXT4_MB_HINT_FIRST is set outside of the main cr=0..3 loop.
> 
>> With hight probability allocator find such group at the start of c3 loop,
>> so goal (allocator starts its searching from goal) will not significantly
>> changed. Thus allocator go through this fragmented range using small steps.
>> 
>> Without suggested optimisation, allocator skips this fragmented range at
>> moment and continue to allocate blocks.
> 
> 1000 groups * 5ms avg.time = 5 seconds to skip 1000 bad uninitialized groups. This is the real problem. You mentioned 4M groups...

Yes, these filesystems have 5M or more groups, which is a real problem.
Alex is working on a patch to do prefetch of the bitmaps, and to read them
in chunks of flex_bg size (256 blocks = 1MB) to cut down on the number of
seeks needed to fetch them from disk.
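
A sketch of what a flex_bg-sized batch could look like (the helper name is hypothetical, error handling is trimmed, and it reuses the ignore_locked variant of ext4_read_block_bitmap_nowait() from the posted patch):

static void ext4_mb_prefetch_flex_bg(struct super_block *sb, ext4_group_t group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        unsigned int flex_size = 1U << sbi->s_log_groups_per_flex;
        ext4_group_t first = group & ~(flex_size - 1);
        struct buffer_head *bh;
        unsigned int i;

        if (!ext4_has_feature_flex_bg(sb))
                return;

        /* with flex_bg the block bitmaps of the whole flex group are packed
         * together on disk, so these non-blocking reads coalesce into one
         * mostly sequential sweep (256 groups x 4KB bitmaps = 1MB) */
        for (i = 0; i < flex_size && first + i < ngroups; i++) {
                bh = ext4_read_block_bitmap_nowait(sb, first + i, 1);
                if (!IS_ERR_OR_NULL(bh))
                        brelse(bh);
        }
}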

Using bigalloc would also help, and getting the number of block groups lower
would avoid the need for meta_bg (which puts each group descriptor into a
separate group, rather than packing them contiguously), but we've had to fix a few
performance issues with bigalloc as well, and have not deployed it in
production yet.

Cheers, Andreas
Alex Zhuravlev Dec. 2, 2019, 8:46 a.m. UTC | #12
> On 26 Nov 2019, at 00:39, Andreas Dilger <adilger@dilger.ca> wrote:
> 
> I think it is important to understand what the actual goal size is at this
> point.  The filesystems where we are seeing problems are _huge_ (650TiB and
> larger) and are relatively full (70% or more) but take tens of minutes to
> finish mounting.  Lustre does some small writes at mount time, but it shouldn't
> take so long to find some small allocations for the config log update.
> 
> The filesystems are automatically getting "s_stripe_size = 512" from mke2fs
> (presumably from the underlying RAID), and I _think_ this is causing mballoc
> to inflate the IO request to 8-16MB prealloc chunks, which would be much
> harder to find, and unnecessary for a small allocation.
> 
Yes, I agree. It makes sense to limit group preallocation in cases like this.
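
For reference, one way to limit it might look roughly like this, loosely modeled on ext4_mb_normalize_group_request(); the 1/16 threshold and the 64-block floor are made-up numbers for illustration, not a tested heuristic:

/* sketch: don't let a stripe-inflated s_mb_group_prealloc turn a tiny
 * request (e.g. a config log update) into a multi-MB goal that is hard to
 * satisfy on a fragmented filesystem */
static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        ext4_grpblk_t prealloc = sbi->s_mb_group_prealloc;

        if (ac->ac_o_ex.fe_len < prealloc / 16)
                prealloc = max_t(ext4_grpblk_t, ac->ac_o_ex.fe_len * 16, 64);

        ac->ac_g_ex.fe_len = prealloc;
        /* the rest of the original function stays as-is */
}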

Thanks, Alex
diff mbox series

Patch

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..76547601384b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -404,7 +404,8 @@  static int ext4_validate_block_bitmap(struct super_block *sb,
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+				 int ignore_locked)
 {
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -435,6 +436,13 @@  ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	if (bitmap_uptodate(bh))
 		goto verify;
 
+	if (ignore_locked && buffer_locked(bh)) {
+		/* buffer under IO already, do not wait
+		 * if called for prefetching */
+		err = 0;
+		goto out;
+	}
+
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
@@ -524,7 +532,7 @@  ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh;
 	int err;
 
-	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	bh = ext4_read_block_bitmap_nowait(sb, block_group, 1);
 	if (IS_ERR(bh))
 		return bh;
 	err = ext4_wait_block_bitmap(sb, block_group, bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 03db3e71676c..2320d7e2f8d6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1480,6 +1480,9 @@  struct ext4_sb_info {
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
+	unsigned int s_mb_toscan0;
+	unsigned int s_mb_toscan1;
+	unsigned int s_mb_prefetch;
 
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -2333,7 +2336,8 @@  extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 
 extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-						ext4_group_t block_group);
+						ext4_group_t block_group,
+						int ignore_locked);
 extern int ext4_wait_block_bitmap(struct super_block *sb,
 				  ext4_group_t block_group,
 				  struct buffer_head *bh);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a3e2767bdf2f..eac4ee225527 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -861,7 +861,7 @@  static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 			bh[i] = NULL;
 			continue;
 		}
-		bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+		bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0);
 		if (IS_ERR(bh[i])) {
 			err = PTR_ERR(bh[i]);
 			bh[i] = NULL;
@@ -2095,10 +2095,52 @@  static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
+/*
+ * each allocation context (i.e. a thread doing allocation) has own
+ * sliding prefetch window of @s_mb_prefetch size which starts at the
+ * very first goal and moves ahead of scaning.
+ * a side effect is that subsequent allocations will likely find
+ * the bitmaps in cache or at least in-flight.
+ */
+static void
+ext4_mb_prefetch(struct ext4_allocation_context *ac,
+		    ext4_group_t start)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_group_info *grp;
+	ext4_group_t group = start;
+	struct buffer_head *bh;
+	int nr;
+
+	/* batch prefetching to get few READs in flight */
+	if (group + (sbi->s_mb_prefetch >> 1) < ac->ac_prefetch)
+		return;
+
+	nr = sbi->s_mb_prefetch;
+	while (nr > 0) {
+		if (++group >= ngroups)
+			group = 0;
+		if (unlikely(group == start))
+			break;
+		grp = ext4_get_group_info(ac->ac_sb, group);
+		/* ignore empty groups - those will be skipped
+		 * during the scanning as well */
+		if (grp->bb_free == 0)
+			continue;
+		nr--;
+		if (!EXT4_MB_GRP_NEED_INIT(grp))
+			continue;
+		bh = ext4_read_block_bitmap_nowait(ac->ac_sb, group, 1);
+		brelse(bh);
+	}
+	ac->ac_prefetch = group;
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-	ext4_group_t ngroups, group, i;
+	ext4_group_t ngroups, toscan, group, i;
 	int cr;
 	int err = 0, first_err = 0;
 	struct ext4_sb_info *sbi;
@@ -2160,6 +2202,9 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	 * cr == 0 try to get exact allocation,
 	 * cr == 3  try to get anything
 	 */
+
+	ac->ac_prefetch = ac->ac_g_ex.fe_group;
+
 repeat:
 	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
 		ac->ac_criteria = cr;
@@ -2169,7 +2214,15 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		 */
 		group = ac->ac_g_ex.fe_group;
 
-		for (i = 0; i < ngroups; group++, i++) {
+		/* limit number of groups to scan at the first two rounds
+		 * when we hope to find something really good */
+		toscan = ngroups;
+		if (cr == 0)
+			toscan = sbi->s_mb_toscan0;
+		else if (cr == 1)
+			toscan = sbi->s_mb_toscan1;
+
+		for (i = 0; i < toscan; group++, i++) {
 			int ret = 0;
 			cond_resched();
 			/*
@@ -2179,6 +2232,8 @@  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 			if (group >= ngroups)
 				group = 0;
 
+			ext4_mb_prefetch(ac, group);
+
 			/* This now checks without needing the buddy page */
 			ret = ext4_mb_good_group(ac, group, cr);
 			if (ret <= 0) {
@@ -2872,6 +2927,9 @@  void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 			bio_put(discard_bio);
 		}
 	}
+	sbi->s_mb_toscan0 = 1024;
+	sbi->s_mb_toscan1 = 4096;
+	sbi->s_mb_prefetch = 32;
 
 	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
 		ext4_free_data_in_buddy(sb, entry);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..9ba5c75e6490 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -175,6 +175,7 @@  struct ext4_allocation_context {
 	struct page *ac_buddy_page;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
+	ext4_group_t ac_prefetch;
 };
 
 #define AC_STATUS_CONTINUE	1
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..4476d828439b 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -198,6 +198,9 @@  EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
 EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
+EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -228,6 +231,9 @@  static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
 	ATTR_LIST(journal_task),
+	ATTR_LIST(mb_toscan0),
+	ATTR_LIST(mb_toscan1),
+	ATTR_LIST(mb_prefetch),
 	NULL,
 };
 ATTRIBUTE_GROUPS(ext4);