[v2,4/9] ext4: fix slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists()

Message ID 20240227091148.178435-5-libaokun1@huawei.com
State Superseded
Series ext4: avoid sysfs variables overflow causing BUG_ON/SOOB

Commit Message

Baokun Li Feb. 27, 2024, 9:11 a.m. UTC
We can trigger a slab-out-of-bounds with the following commands:

    mkfs.ext4 -F /dev/$disk 10G
    mount /dev/$disk /tmp/test
    echo 2147483647 > /sys/fs/ext4/$disk/mb_group_prealloc
    echo test > /tmp/test/file && sync

==================================================================
BUG: KASAN: slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4]
Read of size 8 at addr ffff888121b9d0f0 by task kworker/u2:0/11
CPU: 0 PID: 11 Comm: kworker/u2:0 Tainted: GL 6.7.0-next-20240118 #521
Call Trace:
 dump_stack_lvl+0x2c/0x50
 kasan_report+0xb6/0xf0
 ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4]
 ext4_mb_regular_allocator+0x19e9/0x2370 [ext4]
 ext4_mb_new_blocks+0x88a/0x1370 [ext4]
 ext4_ext_map_blocks+0x14f7/0x2390 [ext4]
 ext4_map_blocks+0x569/0xea0 [ext4]
 ext4_do_writepages+0x10f6/0x1bc0 [ext4]
[...]
==================================================================

The flow that triggers the issue is as follows:

// Set s_mb_group_prealloc to 2147483647 via sysfs
ext4_mb_new_blocks
  ext4_mb_normalize_request
    ext4_mb_normalize_group_request
      ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc
  ext4_mb_regular_allocator
    ext4_mb_choose_next_group
      ext4_mb_choose_next_group_best_avail
        mb_avg_fragment_size_order
          order = fls(len) - 2 = 29
        ext4_mb_find_good_group_avg_frag_lists
          frag_list = &sbi->s_mb_avg_fragment_size[order]
          if (list_empty(frag_list)) // Trigger SOOB!

At 4k block size, the length of the s_mb_avg_fragment_size list is 14,
but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds
to be triggered by an attempt to access an element at index 29.

Add a new attr_id attr_clusters_in_group with values in the range
[0, sbi->s_clusters_per_group] and declare mb_group_prealloc as
that type to fix the issue. In addition avoid returning an order
from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb)
and reduce some useless loops.
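
With the fix applied, the reproducer's sysfs write should fail up front
instead of corrupting the allocation path. A hypothetical transcript
(the accepted maximum is sbi->s_clusters_per_group, which is 32768 for
the 10G filesystem above):

    echo 2147483647 > /sys/fs/ext4/$disk/mb_group_prealloc
    -bash: echo: write error: Invalid argument
    echo 32768 > /sys/fs/ext4/$disk/mb_group_prealloc    # accepted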

Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
CC: stable@vger.kernel.org
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/ext4/mballoc.c |  6 ++++++
 fs/ext4/sysfs.c   | 13 ++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

Comments

Jan Kara March 14, 2024, 10:30 a.m. UTC | #1
On Tue 27-02-24 17:11:43, Baokun Li wrote:
> We can trigger a slab-out-of-bounds with the following commands:
> 
>     mkfs.ext4 -F /dev/$disk 10G
>     mount /dev/$disk /tmp/test
>     echo 2147483647 > /sys/fs/ext4/$disk/mb_group_prealloc
>     echo test > /tmp/test/file && sync
> 
> ==================================================================
> BUG: KASAN: slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4]
> Read of size 8 at addr ffff888121b9d0f0 by task kworker/u2:0/11
> CPU: 0 PID: 11 Comm: kworker/u2:0 Tainted: GL 6.7.0-next-20240118 #521
> Call Trace:
>  dump_stack_lvl+0x2c/0x50
>  kasan_report+0xb6/0xf0
>  ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4]
>  ext4_mb_regular_allocator+0x19e9/0x2370 [ext4]
>  ext4_mb_new_blocks+0x88a/0x1370 [ext4]
>  ext4_ext_map_blocks+0x14f7/0x2390 [ext4]
>  ext4_map_blocks+0x569/0xea0 [ext4]
>  ext4_do_writepages+0x10f6/0x1bc0 [ext4]
> [...]
> ==================================================================
> 
> The flow that triggers the issue is as follows:
> 
> // Set s_mb_group_prealloc to 2147483647 via sysfs
> ext4_mb_new_blocks
>   ext4_mb_normalize_request
>     ext4_mb_normalize_group_request
>       ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc
>   ext4_mb_regular_allocator
>     ext4_mb_choose_next_group
>       ext4_mb_choose_next_group_best_avail
>         mb_avg_fragment_size_order
>           order = fls(len) - 2 = 29
>         ext4_mb_find_good_group_avg_frag_lists
>           frag_list = &sbi->s_mb_avg_fragment_size[order]
>           if (list_empty(frag_list)) // Trigger SOOB!
> 
> At 4k block size, the length of the s_mb_avg_fragment_size list is 14,
> but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds
> to be triggered by an attempt to access an element at index 29.
> 
> Add a new attr_id attr_clusters_in_group with values in the range
> [0, sbi->s_clusters_per_group] and declare mb_group_prealloc as
> that type to fix the issue. In addition avoid returning an order
> from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb)
> and reduce some useless loops.
> 
> Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
> CC: stable@vger.kernel.org
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

Looks good. Just one nit below. Otherwise feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

> ---
>  fs/ext4/mballoc.c |  6 ++++++
>  fs/ext4/sysfs.c   | 13 ++++++++++++-
>  2 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 85a91a61b761..7ad089df2408 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
>  		return 0;
>  	if (order == MB_NUM_ORDERS(sb))
>  		order--;
> +	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
> +		order = MB_NUM_ORDERS(sb) - 1;
>  	return order;
>  }
>  
> @@ -1057,6 +1059,10 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
>  			ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
>  			return;
>  		}
> +
> +		/* Skip some unnecessary loops. */
> +		if (unlikely(i > MB_NUM_ORDERS(ac->ac_sb)))
> +			i = MB_NUM_ORDERS(ac->ac_sb);

How can this possibly trigger now? MB_NUM_ORDERS is sb->s_blocksize_bits +
2. 'i' is starting at fls(ac->ac_g_ex.fe_len) and ac_g_ex.fe_len shouldn't
be larger than clusters per group, hence fls() should be less than
sb->s_blocksize_bits? Am I missing something? And if yes, we should rather
make sure 'order' is never absurdly big?

I suspect this code is defensive up to the point of being confusing :)

								Honza
Baokun Li March 14, 2024, 11:24 a.m. UTC | #2
Hi Jan,

On 2024/3/14 18:30, Jan Kara wrote:
> On Tue 27-02-24 17:11:43, Baokun Li wrote:
>
>
> At 4k block size, the length of the s_mb_avg_fragment_size list is 14,
> but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds
> to be triggered by an attempt to access an element at index 29.
>
> Add a new attr_id attr_clusters_in_group with values in the range
> [0, sbi->s_clusters_per_group] and declare mb_group_prealloc as
> that type to fix the issue. In addition avoid returning an order
> from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb)
> and reduce some useless loops.
>
> Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
> CC: stable@vger.kernel.org
> Signed-off-by: Baokun Li <libaokun1@huawei.com>
> Looks good. Just one nit below. Otherwise feel free to add:
>
> Reviewed-by: Jan Kara <jack@suse.cz>
>
>> ---
>>   fs/ext4/mballoc.c |  6 ++++++
>>   fs/ext4/sysfs.c   | 13 ++++++++++++-
>>   2 files changed, 18 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 85a91a61b761..7ad089df2408 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
>>   		return 0;
>>   	if (order == MB_NUM_ORDERS(sb))
>>   		order--;
>> +	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
>> +		order = MB_NUM_ORDERS(sb) - 1;
>>   	return order;
>>   }
>>   
>> @@ -1057,6 +1059,10 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
>>   			ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
>>   			return;
>>   		}
>> +
>> +		/* Skip some unnecessary loops. */
>> +		if (unlikely(i > MB_NUM_ORDERS(ac->ac_sb)))
>> +			i = MB_NUM_ORDERS(ac->ac_sb);
> How can this possibly trigger now? MB_NUM_ORDERS is sb->s_blocksize_bits +
> 2. 'i' is starting at fls(ac->ac_g_ex.fe_len) and ac_g_ex.fe_len shouldn't
> be larger than clusters per group, hence fls() should be less than
> sb->s_blocksize_bits? Am I missing something? And if yes, we should rather
> make sure 'order' is never absurdly big?
>
> I suspect this code is defensive up to the point of being confusing :)
>
> Honza 

Yes, this is indeed defensive code! We only walk into this branch when
WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)) is triggered.
As previously mentioned by Ojaswin at the following link:

"The reason for this is that otherwise when order is large eg 29,
we would unnecessarily loop from i=29 to i=13 while always
looking at the same avg_fragment_list[13]."

Link: https://lore.kernel.org/lkml/ZdQ7FEA7KC4eAMpg@li-bb2b2a4c-3307-11b2-a85c-8fa5c3a69313.ibm.com/

Thank you so much for the review! ღ( ´・ᴗ・` )
Jan Kara March 14, 2024, noon UTC | #3
On Thu 14-03-24 19:24:56, Baokun Li wrote:
> Hi Jan,
> 
> On 2024/3/14 18:30, Jan Kara wrote:
> > On Tue 27-02-24 17:11:43, Baokun Li wrote:
> > 
> > 
> > At 4k block size, the length of the s_mb_avg_fragment_size list is 14,
> > but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds
> > to be triggered by an attempt to access an element at index 29.
> > 
> > Add a new attr_id attr_clusters_in_group with values in the range
> > [0, sbi->s_clusters_per_group] and declare mb_group_prealloc as
> > that type to fix the issue. In addition avoid returning an order
> > from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb)
> > and reduce some useless loops.
> > 
> > Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
> > CC: stable@vger.kernel.org
> > Signed-off-by: Baokun Li <libaokun1@huawei.com>
> > Looks good. Just one nit below. Otherwise feel free to add:
> > 
> > Reviewed-by: Jan Kara <jack@suse.cz>
> > 
> > > ---
> > >   fs/ext4/mballoc.c |  6 ++++++
> > >   fs/ext4/sysfs.c   | 13 ++++++++++++-
> > >   2 files changed, 18 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> > > index 85a91a61b761..7ad089df2408 100644
> > > --- a/fs/ext4/mballoc.c
> > > +++ b/fs/ext4/mballoc.c
> > > @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
> > >   		return 0;
> > >   	if (order == MB_NUM_ORDERS(sb))
> > >   		order--;
> > > +	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
> > > +		order = MB_NUM_ORDERS(sb) - 1;
> > >   	return order;
> > >   }
> > > @@ -1057,6 +1059,10 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
> > >   			ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
> > >   			return;
> > >   		}
> > > +
> > > +		/* Skip some unnecessary loops. */
> > > +		if (unlikely(i > MB_NUM_ORDERS(ac->ac_sb)))
> > > +			i = MB_NUM_ORDERS(ac->ac_sb);
> > How can this possibly trigger now? MB_NUM_ORDERS is sb->s_blocksize_bits +
> > 2. 'i' is starting at fls(ac->ac_g_ex.fe_len) and ac_g_ex.fe_len shouldn't
> > be larger than clusters per group, hence fls() should be less than
> > sb->s_blocksize_bits? Am I missing something? And if yes, we should rather
> > make sure 'order' is never absurdly big?
> > 
> > I suspect this code is defensive up to the point of being confusing :)
> > 
> > Honza
> 
> Yes, this is indeed defensive code! We only walk into this branch when
> WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)) is triggered.
> As previously mentioned by Ojaswin at the following link:
> 
> "The reason for this is that otherwise when order is large eg 29,
> we would unnecessarily loop from i=29 to i=13 while always
> looking at the same avg_fragment_list[13]."
> 
> Link: https://lore.kernel.org/lkml/ZdQ7FEA7KC4eAMpg@li-bb2b2a4c-3307-11b2-a85c-8fa5c3a69313.ibm.com/
> 
> Thank you so much for the review! ღ( ´・ᴗ・` )

Thanks for the link. What Ojaswin suggested was slightly different,
though: he suggested trimming the order before the for loop, not after
the first iteration as you do, which is what was confusing me. I'd
even suggest replacing your check with:

        /*      
         * mb_avg_fragment_size_order() returns order in a way that makes
         * retrieving back the length using (1 << order) inaccurate. Hence, use
         * fls() instead since we need to know the actual length while modifying
         * goal length.
         */     
-       order = fls(ac->ac_g_ex.fe_len) - 1;
+	order = min(fls(ac->ac_g_ex.fe_len), MB_NUM_ORDERS(ac->ac_sb)) - 1;
        min_order = order - sbi->s_mb_best_avail_max_trim_order;
        if (min_order < 0)
                min_order = 0;

								Honza
Baokun Li March 14, 2024, 12:37 p.m. UTC | #4
On 2024/3/14 20:00, Jan Kara wrote:
> On Thu 14-03-24 19:24:56, Baokun Li wrote:
>> Hi Jan,
>>
>> On 2024/3/14 18:30, Jan Kara wrote:
>>> On Tue 27-02-24 17:11:43, Baokun Li wrote:
>>>
>>>
>>> At 4k block size, the length of the s_mb_avg_fragment_size list is 14,
>>> but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds
>>> to be triggered by an attempt to access an element at index 29.
>>>
>>> Add a new attr_id attr_clusters_in_group with values in the range
>>> [0, sbi->s_clusters_per_group] and declare mb_group_prealloc as
>>> that type to fix the issue. In addition avoid returning an order
>>> from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb)
>>> and reduce some useless loops.
>>>
>>> Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
>>> CC: stable@vger.kernel.org
>>> Signed-off-by: Baokun Li <libaokun1@huawei.com>
>>> Looks good. Just one nit below. Otherwise feel free to add:
>>>
>>> Reviewed-by: Jan Kara <jack@suse.cz>
>>>
>>>> ---
>>>>    fs/ext4/mballoc.c |  6 ++++++
>>>>    fs/ext4/sysfs.c   | 13 ++++++++++++-
>>>>    2 files changed, 18 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>>> index 85a91a61b761..7ad089df2408 100644
>>>> --- a/fs/ext4/mballoc.c
>>>> +++ b/fs/ext4/mballoc.c
>>>> @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
>>>>    		return 0;
>>>>    	if (order == MB_NUM_ORDERS(sb))
>>>>    		order--;
>>>> +	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
>>>> +		order = MB_NUM_ORDERS(sb) - 1;
>>>>    	return order;
>>>>    }
>>>> @@ -1057,6 +1059,10 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
>>>>    			ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
>>>>    			return;
>>>>    		}
>>>> +
>>>> +		/* Skip some unnecessary loops. */
>>>> +		if (unlikely(i > MB_NUM_ORDERS(ac->ac_sb)))
>>>> +			i = MB_NUM_ORDERS(ac->ac_sb);
>>> How can this possibly trigger now? MB_NUM_ORDERS is sb->s_blocksize_bits +
>>> 2. 'i' is starting at fls(ac->ac_g_ex.fe_len) and ac_g_ex.fe_len shouldn't
>>> be larger than clusters per group, hence fls() should be less than
>>> sb->s_blocksize_bits? Am I missing something? And if yes, we should rather
>>> make sure 'order' is never absurdly big?
>>>
>>> I suspect this code is defensive up to the point of being confusing :)
>>>
>>> Honza
>> Yes, this is indeed defensive code! We only walk into this branch when
>> WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)) is triggered.
>> As previously mentioned by Ojaswin at the following link:
>>
>> "The reason for this is that otherwise when order is large eg 29,
>> we would unnecessarily loop from i=29 to i=13 while always
>> looking at the same avg_fragment_list[13]."
>>
>> Link: https://lore.kernel.org/lkml/ZdQ7FEA7KC4eAMpg@li-bb2b2a4c-3307-11b2-a85c-8fa5c3a69313.ibm.com/
>>
>> Thank you so much for the review! ღ( ´・ᴗ・` )
> Thanks for the link. What Ojaswin suggested was slightly different,
> though: he suggested trimming the order before the for loop, not after
> the first iteration as you do, which is what was confusing me. I'd
> even suggest replacing your check with:
>
>          /*
>           * mb_avg_fragment_size_order() returns order in a way that makes
>           * retrieving back the length using (1 << order) inaccurate. Hence, use
>           * fls() instead since we need to know the actual length while modifying
>           * goal length.
>           */
> -       order = fls(ac->ac_g_ex.fe_len) - 1;
> +	order = min(fls(ac->ac_g_ex.fe_len), MB_NUM_ORDERS(ac->ac_sb)) - 1;
>          min_order = order - sbi->s_mb_best_avail_max_trim_order;
>          if (min_order < 0)
>                  min_order = 0;
>
> 								Honza
Yes, I changed it that way because this only happens when a bug
somewhere causes fe_len to be a huge value. I think in that case we
should report the problem via WARN_ON_ONCE(), and trimming the
order before the for loop would bypass the WARN_ON_ONCE() and not
report any errors.

Thanks!
Jan Kara March 14, 2024, 12:50 p.m. UTC | #5
On Thu 14-03-24 20:37:38, Baokun Li wrote:
> On 2024/3/14 20:00, Jan Kara wrote:
> > On Thu 14-03-24 19:24:56, Baokun Li wrote:
> > > Hi Jan,
> > > 
> > > On 2024/3/14 18:30, Jan Kara wrote:
> > > > On Tue 27-02-24 17:11:43, Baokun Li wrote:
> > > > 
> > > > 
> > > > At 4k block size, the length of the s_mb_avg_fragment_size list is 14,
> > > > but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds
> > > > to be triggered by an attempt to access an element at index 29.
> > > > 
> > > > Add a new attr_id attr_clusters_in_group with values in the range
> > > > [0, sbi->s_clusters_per_group] and declare mb_group_prealloc as
> > > > that type to fix the issue. In addition avoid returning an order
> > > > from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb)
> > > > and reduce some useless loops.
> > > > 
> > > > Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
> > > > CC: stable@vger.kernel.org
> > > > Signed-off-by: Baokun Li <libaokun1@huawei.com>
> > > > Looks good. Just one nit below. Otherwise feel free to add:
> > > > 
> > > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > > 
> > > > > ---
> > > > >    fs/ext4/mballoc.c |  6 ++++++
> > > > >    fs/ext4/sysfs.c   | 13 ++++++++++++-
> > > > >    2 files changed, 18 insertions(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> > > > > index 85a91a61b761..7ad089df2408 100644
> > > > > --- a/fs/ext4/mballoc.c
> > > > > +++ b/fs/ext4/mballoc.c
> > > > > @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
> > > > >    		return 0;
> > > > >    	if (order == MB_NUM_ORDERS(sb))
> > > > >    		order--;
> > > > > +	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
> > > > > +		order = MB_NUM_ORDERS(sb) - 1;
> > > > >    	return order;
> > > > >    }
> > > > > @@ -1057,6 +1059,10 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
> > > > >    			ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
> > > > >    			return;
> > > > >    		}
> > > > > +
> > > > > +		/* Skip some unnecessary loops. */
> > > > > +		if (unlikely(i > MB_NUM_ORDERS(ac->ac_sb)))
> > > > > +			i = MB_NUM_ORDERS(ac->ac_sb);
> > > > How can this possibly trigger now? MB_NUM_ORDERS is sb->s_blocksize_bits +
> > > > 2. 'i' is starting at fls(ac->ac_g_ex.fe_len) and ac_g_ex.fe_len shouldn't
> > > > be larger than clusters per group, hence fls() should be less than
> > > > sb->s_blocksize_bits? Am I missing something? And if yes, we should rather
> > > > make sure 'order' is never absurdly big?
> > > > 
> > > > I suspect this code is defensive up to the point of being confusing :)
> > > > 
> > > > Honza
> > > Yes, this is indeed defensive code! We only walk into this branch when
> > > WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)) is triggered.
> > > As previously mentioned by Ojaswin at the following link:
> > > 
> > > "The reason for this is that otherwise when order is large eg 29,
> > > we would unnecessarily loop from i=29 to i=13 while always
> > > looking at the same avg_fragment_list[13]."
> > > 
> > > Link: https://lore.kernel.org/lkml/ZdQ7FEA7KC4eAMpg@li-bb2b2a4c-3307-11b2-a85c-8fa5c3a69313.ibm.com/
> > > 
> > > Thank you so much for the review! ღ( ´・ᴗ・` )
> > Thanks for the link. What Ojaswin suggested was slightly different,
> > though: he suggested trimming the order before the for loop, not after
> > the first iteration as you do, which is what was confusing me. I'd
> > even suggest replacing your check with:
> > 
> >          /*
> >           * mb_avg_fragment_size_order() returns order in a way that makes
> >           * retrieving back the length using (1 << order) inaccurate. Hence, use
> >           * fls() instead since we need to know the actual length while modifying
> >           * goal length.
> >           */
> > -       order = fls(ac->ac_g_ex.fe_len) - 1;
> > +	order = min(fls(ac->ac_g_ex.fe_len), MB_NUM_ORDERS(ac->ac_sb)) - 1;
> >          min_order = order - sbi->s_mb_best_avail_max_trim_order;
> >          if (min_order < 0)
> >                  min_order = 0;
> > 
> > 								Honza
> Yes, I changed it that way because this only happens when a bug
> somewhere causes fe_len to be a huge value. I think in that case we
> should report the problem via WARN_ON_ONCE(), and trimming the
> order before the for loop would bypass the WARN_ON_ONCE() and not
> report any errors.

Fair enough. Then:
         /*
          * mb_avg_fragment_size_order() returns order in a way that makes
          * retrieving back the length using (1 << order) inaccurate. Hence, use
          * fls() instead since we need to know the actual length while modifying
          * goal length.
          */
	order = fls(ac->ac_g_ex.fe_len) - 1;
+	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(ac->ac_sb) - 1))
+		order = MB_NUM_ORDERS(ac->ac_sb) - 1;
        min_order = order - sbi->s_mb_best_avail_max_trim_order;
        if (min_order < 0)
                min_order = 0;

Still much less confusing...

								Honza
Baokun Li March 14, 2024, 1:47 p.m. UTC | #6
On 2024/3/14 20:50, Jan Kara wrote:
> On Thu 14-03-24 20:37:38, Baokun Li wrote:
>> On 2024/3/14 20:00, Jan Kara wrote:
>>> On Thu 14-03-24 19:24:56, Baokun Li wrote:
>>>> Hi Jan,
>>>>
>>>> On 2024/3/14 18:30, Jan Kara wrote:
>>>>> On Tue 27-02-24 17:11:43, Baokun Li wrote:
>>>>>
>>>>>
>>>>> At 4k block size, the length of the s_mb_avg_fragment_size list is 14,
>>>>> but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds
>>>>> to be triggered by an attempt to access an element at index 29.
>>>>>
>>>>> Add a new attr_id attr_clusters_in_group with values in the range
>>>>> [0, sbi->s_clusters_per_group] and declare mb_group_prealloc as
>>>>> that type to fix the issue. In addition avoid returning an order
>>>>> from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb)
>>>>> and reduce some useless loops.
>>>>>
>>>>> Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)")
>>>>> CC: stable@vger.kernel.org
>>>>> Signed-off-by: Baokun Li <libaokun1@huawei.com>
>>>>> Looks good. Just one nit below. Otherwise feel free to add:
>>>>>
>>>>> Reviewed-by: Jan Kara <jack@suse.cz>
>>>>>
>>>>>> ---
>>>>>>     fs/ext4/mballoc.c |  6 ++++++
>>>>>>     fs/ext4/sysfs.c   | 13 ++++++++++++-
>>>>>>     2 files changed, 18 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>>>>>> index 85a91a61b761..7ad089df2408 100644
>>>>>> --- a/fs/ext4/mballoc.c
>>>>>> +++ b/fs/ext4/mballoc.c
>>>>>> @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
>>>>>>     		return 0;
>>>>>>     	if (order == MB_NUM_ORDERS(sb))
>>>>>>     		order--;
>>>>>> +	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
>>>>>> +		order = MB_NUM_ORDERS(sb) - 1;
>>>>>>     	return order;
>>>>>>     }
>>>>>> @@ -1057,6 +1059,10 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
>>>>>>     			ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
>>>>>>     			return;
>>>>>>     		}
>>>>>> +
>>>>>> +		/* Skip some unnecessary loops. */
>>>>>> +		if (unlikely(i > MB_NUM_ORDERS(ac->ac_sb)))
>>>>>> +			i = MB_NUM_ORDERS(ac->ac_sb);
>>>>> How can this possibly trigger now? MB_NUM_ORDERS is sb->s_blocksize_bits +
>>>>> 2. 'i' is starting at fls(ac->ac_g_ex.fe_len) and ac_g_ex.fe_len shouldn't
>>>>> be larger than clusters per group, hence fls() should be less than
>>>>> sb->s_blocksize_bits? Am I missing something? And if yes, we should rather
>>>>> make sure 'order' is never absurdly big?
>>>>>
>>>>> I suspect this code is defensive up to the point of being confusing :)
>>>>>
>>>>> Honza
>>>> Yes, this is indeed defensive code! We only walk into this branch when
>>>> WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)) is triggered.
>>>> As previously mentioned by Ojaswin at the following link:
>>>>
>>>> "The reason for this is that otherwise when order is large eg 29,
>>>> we would unnecessarily loop from i=29 to i=13 while always
>>>> looking at the same avg_fragment_list[13]."
>>>>
>>>> Link: https://lore.kernel.org/lkml/ZdQ7FEA7KC4eAMpg@li-bb2b2a4c-3307-11b2-a85c-8fa5c3a69313.ibm.com/
>>>>
>>>> Thank you so much for the review! ღ( ´・ᴗ・` )
>>> Thanks for the link. What Ojaswin suggested was slightly different,
>>> though: he suggested trimming the order before the for loop, not after
>>> the first iteration as you do, which is what was confusing me. I'd
>>> even suggest replacing your check with:
>>>
>>>           /*
>>>            * mb_avg_fragment_size_order() returns order in a way that makes
>>>            * retrieving back the length using (1 << order) inaccurate. Hence, use
>>>            * fls() instead since we need to know the actual length while modifying
>>>            * goal length.
>>>            */
>>> -       order = fls(ac->ac_g_ex.fe_len) - 1;
>>> +	order = min(fls(ac->ac_g_ex.fe_len), MB_NUM_ORDERS(ac->ac_sb)) - 1;
>>>           min_order = order - sbi->s_mb_best_avail_max_trim_order;
>>>           if (min_order < 0)
>>>                   min_order = 0;
>>>
>>> 								Honza
>> Yes, I changed it that way because this only happens when a bug
>> somewhere causes fe_len to be a huge value. I think in that case we
>> should report the problem via WARN_ON_ONCE(), and trimming the
>> order before the for loop would bypass the WARN_ON_ONCE() and not
>> report any errors.
> Fair enough. Then:
>           /*
>            * mb_avg_fragment_size_order() returns order in a way that makes
>            * retrieving back the length using (1 << order) inaccurate. Hence, use
>            * fls() instead since we need to know the actual length while modifying
>            * goal length.
>            */
> 	order = fls(ac->ac_g_ex.fe_len) - 1;
> +	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(ac->ac_sb) - 1))
> +		order = MB_NUM_ORDERS(ac->ac_sb) - 1;
>          min_order = order - sbi->s_mb_best_avail_max_trim_order;
>          if (min_order < 0)
>                  min_order = 0;
>
> Still much less confusing...
>
> 								Honza
Yes, this does look much better!
Let me send v3!

Thanks for the suggestion!

Patch

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 85a91a61b761..7ad089df2408 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -831,6 +831,8 @@  static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
 		return 0;
 	if (order == MB_NUM_ORDERS(sb))
 		order--;
+	if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
+		order = MB_NUM_ORDERS(sb) - 1;
 	return order;
 }
 
@@ -1057,6 +1059,10 @@  static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
 			ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
 			return;
 		}
+
+		/* Skip some unnecessary loops. */
+		if (unlikely(i > MB_NUM_ORDERS(ac->ac_sb)))
+			i = MB_NUM_ORDERS(ac->ac_sb);
 	}
 
 	/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 7f455b5f22c0..ddd71673176c 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -29,6 +29,7 @@  typedef enum {
 	attr_trigger_test_error,
 	attr_first_error_time,
 	attr_last_error_time,
+	attr_clusters_in_group,
 	attr_feature,
 	attr_pointer_ui,
 	attr_pointer_ul,
@@ -207,13 +208,14 @@  EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
+EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group,
+		 ext4_sb_info, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
@@ -376,6 +378,7 @@  static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
 
 	switch (a->attr_id) {
 	case attr_inode_readahead:
+	case attr_clusters_in_group:
 	case attr_pointer_ui:
 		if (a->attr_ptr == ptr_ext4_super_block_offset)
 			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
@@ -455,6 +458,14 @@  static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
 		else
 			*((unsigned int *) ptr) = t;
 		return len;
+	case attr_clusters_in_group:
+		ret = kstrtouint(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		if (t > sbi->s_clusters_per_group)
+			return -EINVAL;
+		*((unsigned int *) ptr) = t;
+		return len;
 	case attr_pointer_ul:
 		ret = kstrtoul(skip_spaces(buf), 0, &lt);
 		if (ret)