diff mbox series

[RFC] ext4: dio take shared inode lock when overwriting preallocated blocks

Message ID 20221203103956.3691847-1-yi.zhang@huawei.com
State New
Headers show
Series [RFC] ext4: dio take shared inode lock when overwriting preallocated blocks | expand

Commit Message

Zhang Yi Dec. 3, 2022, 10:39 a.m. UTC
In the dio write path, we only take shared inode lock for the case of
aligned overwriting initialized blocks inside EOF. But for overwriting
preallocated blocks, it may only need to split unwritten extents, this
procedure has been protected under i_data_sem lock, it's safe to
release the exclusive inode lock and take shared inode lock.

This could give a significant speed up for multi-threaded writes. Test
on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.

 direct=1
 ioengine=libaio
 iodepth=10
 numjobs=10
 runtime=60
 rw=randwrite
 size=100G

And the test result are:
Before:
 bs=4k       IOPS=11.1k, BW=43.2MiB/s
 bs=16k      IOPS=11.1k, BW=173MiB/s
 bs=64k      IOPS=11.2k, BW=697MiB/s

After:
 bs=4k       IOPS=41.4k, BW=162MiB/s
 bs=16k      IOPS=41.3k, BW=646MiB/s
 bs=64k      IOPS=13.5k, BW=843MiB/s

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 It passed xfstests auto mode with 1k and 4k blocksize.

 fs/ext4/file.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

Comments

Zhang Yi Dec. 14, 2022, 1:44 p.m. UTC | #1
Hello, is anybody have advice?

Thanks,
Yi.

On 2022/12/3 18:39, Zhang Yi wrote:
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
> 
> This could give a significant speed up for multi-threaded writes. Test
> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
> 
>  direct=1
>  ioengine=libaio
>  iodepth=10
>  numjobs=10
>  runtime=60
>  rw=randwrite
>  size=100G
> 
> And the test result are:
> Before:
>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>  bs=16k      IOPS=11.1k, BW=173MiB/s
>  bs=64k      IOPS=11.2k, BW=697MiB/s
> 
> After:
>  bs=4k       IOPS=41.4k, BW=162MiB/s
>  bs=16k      IOPS=41.3k, BW=646MiB/s
>  bs=64k      IOPS=13.5k, BW=843MiB/s
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  It passed xfstests auto mode with 1k and 4k blocksize.
> 
>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index a7a597c727e6..7edac94025ac 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>  	return false;
>  }
>  
> -/* Is IO overwriting allocated and initialized blocks? */
> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> +/* Is IO overwriting allocated or initialized blocks? */
> +static bool ext4_overwrite_io(struct inode *inode,
> +			      loff_t pos, loff_t len, bool *inited)
>  {
>  	struct ext4_map_blocks map;
>  	unsigned int blkbits = inode->i_blkbits;
> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>  	blklen = map.m_len;
>  
>  	err = ext4_map_blocks(NULL, inode, &map, 0);
> +	if (err != blklen)
> +		return false;
>  	/*
>  	 * 'err==len' means that all of the blocks have been preallocated,
> -	 * regardless of whether they have been initialized or not. To exclude
> -	 * unwritten extents, we need to check m_flags.
> +	 * regardless of whether they have been initialized or not. We need to
> +	 * check m_flags to distinguish the unwritten extents.
>  	 */
> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
> +	*inited = !!(map.m_flags & EXT4_MAP_MAPPED);
> +	return true;
>  }
>  
>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>   * - For extending writes case we don't take the shared lock, since it requires
>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>   *
> - * - shared locking will only be true mostly with overwrites. Otherwise we will
> - *   switch to exclusive i_rwsem lock.
> + * - shared locking will only be true mostly with overwrites, include
> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
> + *   we protects splitting extents by i_data_sem in ext4_inode_info, so we can
> + *   also release exclusive i_rwsem lock.
> + *
> + * - Otherwise we will switch to exclusive i_rwsem lock.
>   */
>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> -				     bool *ilock_shared, bool *extend)
> +				     bool *ilock_shared, bool *extend,
> +				     bool *overwrite)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(file);
> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>  	 * in file_modified().
>  	 */
>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> -	     !ext4_overwrite_io(inode, offset, count))) {
> +	     !ext4_overwrite_io(inode, offset, count, overwrite))) {
>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>  			ret = -EAGAIN;
>  			goto out;
> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	loff_t offset = iocb->ki_pos;
>  	size_t count = iov_iter_count(from);
>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> -	bool extend = false, unaligned_io = false;
> +	bool extend = false, unaligned_io = false, overwrite = false;
>  	bool ilock_shared = true;
>  
>  	/*
> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		return ext4_buffered_write_iter(iocb, from);
>  	}
>  
> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
> +	ret = ext4_dio_write_checks(iocb, from,
> +				    &ilock_shared, &extend, &overwrite);
>  	if (ret <= 0)
>  		return ret;
>  
> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		ext4_journal_stop(handle);
>  	}
>  
> -	if (ilock_shared)
> +	if (overwrite)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
>
Jan Kara Dec. 14, 2022, 5:01 p.m. UTC | #2
On Sat 03-12-22 18:39:56, Zhang Yi wrote:
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
> 
> This could give a significant speed up for multi-threaded writes. Test
> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
> 
>  direct=1
>  ioengine=libaio
>  iodepth=10
>  numjobs=10
>  runtime=60
>  rw=randwrite
>  size=100G
> 
> And the test result are:
> Before:
>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>  bs=16k      IOPS=11.1k, BW=173MiB/s
>  bs=64k      IOPS=11.2k, BW=697MiB/s
> 
> After:
>  bs=4k       IOPS=41.4k, BW=162MiB/s
>  bs=16k      IOPS=41.3k, BW=646MiB/s
>  bs=64k      IOPS=13.5k, BW=843MiB/s
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  It passed xfstests auto mode with 1k and 4k blocksize.

Besides some naming nits (see below) I think this should work. But I have
to say I'm a bit uneasy about this because we will now be changing block
mapping from unwritten to written only with shared i_rwsem. OTOH that
happens during writeback as well so we should be fine and the gain is very
nice.

Out of curiosity do you have a real usecase for this?

>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index a7a597c727e6..7edac94025ac 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>  	return false;
>  }
>  
> -/* Is IO overwriting allocated and initialized blocks? */
> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> +/* Is IO overwriting allocated or initialized blocks? */
> +static bool ext4_overwrite_io(struct inode *inode,
> +			      loff_t pos, loff_t len, bool *inited)

'inited' just sounds weird. Correct is 'initialized' which is a bit long.
Maybe just negate the meaning and call this 'unwritten'?

>  {
>  	struct ext4_map_blocks map;
>  	unsigned int blkbits = inode->i_blkbits;
> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>  	blklen = map.m_len;
>  
>  	err = ext4_map_blocks(NULL, inode, &map, 0);
> +	if (err != blklen)
> +		return false;
>  	/*
>  	 * 'err==len' means that all of the blocks have been preallocated,
> -	 * regardless of whether they have been initialized or not. To exclude
> -	 * unwritten extents, we need to check m_flags.
> +	 * regardless of whether they have been initialized or not. We need to
> +	 * check m_flags to distinguish the unwritten extents.
>  	 */
> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
> +	*inited = !!(map.m_flags & EXT4_MAP_MAPPED);
> +	return true;
>  }
>  
>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>   * - For extending writes case we don't take the shared lock, since it requires
>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>   *
> - * - shared locking will only be true mostly with overwrites. Otherwise we will
> - *   switch to exclusive i_rwsem lock.
> + * - shared locking will only be true mostly with overwrites, include
> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
> + *   we protects splitting extents by i_data_sem in ext4_inode_info, so we can
> + *   also release exclusive i_rwsem lock.
> + *
> + * - Otherwise we will switch to exclusive i_rwsem lock.
>   */
>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> -				     bool *ilock_shared, bool *extend)
> +				     bool *ilock_shared, bool *extend,
> +				     bool *overwrite)

And it would be good to preserve the argument name in other functions as
well - i.e., keep using 'unwritten' here as well.

>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(file);
> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>  	 * in file_modified().
>  	 */
>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> -	     !ext4_overwrite_io(inode, offset, count))) {
> +	     !ext4_overwrite_io(inode, offset, count, overwrite))) {
>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>  			ret = -EAGAIN;
>  			goto out;
> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	loff_t offset = iocb->ki_pos;
>  	size_t count = iov_iter_count(from);
>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> -	bool extend = false, unaligned_io = false;
> +	bool extend = false, unaligned_io = false, overwrite = false;
>  	bool ilock_shared = true;
>  
>  	/*
> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		return ext4_buffered_write_iter(iocb, from);
>  	}
>  
> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
> +	ret = ext4_dio_write_checks(iocb, from,
> +				    &ilock_shared, &extend, &overwrite);
>  	if (ret <= 0)
>  		return ret;
>  
> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		ext4_journal_stop(handle);
>  	}
>  
> -	if (ilock_shared)
> +	if (overwrite)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
Theodore Ts'o Dec. 14, 2022, 6:52 p.m. UTC | #3
On Wed, Dec 14, 2022 at 06:01:25PM +0100, Jan Kara wrote:
> 
> Besides some naming nits (see below) I think this should work. But I have
> to say I'm a bit uneasy about this because we will now be changing block
> mapping from unwritten to written only with shared i_rwsem. OTOH that
> happens during writeback as well so we should be fine and the gain is very
> nice.

Hmm.... when I was looking potential impacts of the change what
ext4_overwrite_io() would do, I looked at the current user of that
function in ext4_dio_write_checks().

	/*
	 * Determine whether the IO operation will overwrite allocated
	 * and initialized blocks.
	 * We need exclusive i_rwsem for changing security info
	 * in file_modified().
	 */
	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
	     !ext4_overwrite_io(inode, offset, count))) {
		if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		inode_unlock_shared(inode);
		*ilock_shared = false;
		inode_lock(inode);
		goto restart;
	}

	ret = file_modified(file);
	if (ret < 0)
		goto out;

What is confusing me is the comment, "We need exclusive i_rwsem for
changing security info in file_modified().".  But then we end up
calling file_modified() unconditionally, regardless of whether we've
transitioned from a shared lock to an exclusive lock.

So file_modified() can get called either with or without the inode
locked r/w.  I realize that this patch doesn't change this
inconsistency, but it appears either the comment is wrong, or the code
is wrong.

What am I missing?

						- Ted
Zhang Yi Dec. 15, 2022, 8:24 a.m. UTC | #4
On 2022/12/15 1:01, Jan Kara wrote:
> On Sat 03-12-22 18:39:56, Zhang Yi wrote:
>> In the dio write path, we only take shared inode lock for the case of
>> aligned overwriting initialized blocks inside EOF. But for overwriting
>> preallocated blocks, it may only need to split unwritten extents, this
>> procedure has been protected under i_data_sem lock, it's safe to
>> release the exclusive inode lock and take shared inode lock.
>>
>> This could give a significant speed up for multi-threaded writes. Test
>> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>>
>>  direct=1
>>  ioengine=libaio
>>  iodepth=10
>>  numjobs=10
>>  runtime=60
>>  rw=randwrite
>>  size=100G
>>
>> And the test result are:
>> Before:
>>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>>  bs=16k      IOPS=11.1k, BW=173MiB/s
>>  bs=64k      IOPS=11.2k, BW=697MiB/s
>>
>> After:
>>  bs=4k       IOPS=41.4k, BW=162MiB/s
>>  bs=16k      IOPS=41.3k, BW=646MiB/s
>>  bs=64k      IOPS=13.5k, BW=843MiB/s
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>  It passed xfstests auto mode with 1k and 4k blocksize.
> 
> Besides some naming nits (see below) I think this should work. But I have
> to say I'm a bit uneasy about this because we will now be changing block
> mapping from unwritten to written only with shared i_rwsem. OTOH that
> happens during writeback as well so we should be fine and the gain is very
> nice.
> 
Thanks for advice, I will change the argument name to make it more reasonable.

> Out of curiosity do you have a real usecase for this?

No, I was just analyse the performance gap in our benchmark tests, and have
some question and idea while reading the code. Maybe it could speed up the
first time write in some database. :)

Besides, I want to discuss it a bit more. I originally changed this code to
switch to take the shared inode and also use ext4_iomap_overwrite_ops for
the overwriting preallocated blocks case. It will postpone the splitting extent
procedure to endio and could give a much more gain than this patch (+~27%).

This patch:
  bs=4k       IOPS=41.4k, BW=162MiB/s
Postpone split:
  bs=4k       IOPS=52.9k, BW=207MiB/s

But I think it's maybe too radical. I looked at the history and notice in commit
0031462b5b39 ("ext4: Split uninitialized extents for direct I/O"), it introduce
the flag EXT4_GET_BLOCKS_DIO(now it had been renamed to EXT4_GET_BLOCKS_PRE_IO)
to make sure that the preallocated unwritten extent have been splitted before
submitting the I/O, which is used to prevent the ENOSPC problem if the filesystem
run out-of-space in the endio procedure. And 4 years later, commit 27dd43854227
("ext4: introduce reserved space") reserve some blocks for metedata allocation.
It looks like this commit could also slove the ENOSPC problem for most cases if
we postpone extent splitting into the endio procedure. I don't find other side
effect, so I think it may also fine if we do that. Do you have some advice or
am I missing something?

Thanks,
Yi.

> 
>>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>>  1 file changed, 22 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index a7a597c727e6..7edac94025ac 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>>  	return false;
>>  }
>>  
>> -/* Is IO overwriting allocated and initialized blocks? */
>> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>> +/* Is IO overwriting allocated or initialized blocks? */
>> +static bool ext4_overwrite_io(struct inode *inode,
>> +			      loff_t pos, loff_t len, bool *inited)
> 
> 'inited' just sounds weird. Correct is 'initialized' which is a bit long.
> Maybe just negate the meaning and call this 'unwritten'?
> 
>>  {
>>  	struct ext4_map_blocks map;
>>  	unsigned int blkbits = inode->i_blkbits;
>> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>>  	blklen = map.m_len;
>>  
>>  	err = ext4_map_blocks(NULL, inode, &map, 0);
>> +	if (err != blklen)
>> +		return false;
>>  	/*
>>  	 * 'err==len' means that all of the blocks have been preallocated,
>> -	 * regardless of whether they have been initialized or not. To exclude
>> -	 * unwritten extents, we need to check m_flags.
>> +	 * regardless of whether they have been initialized or not. We need to
>> +	 * check m_flags to distinguish the unwritten extents.
>>  	 */
>> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
>> +	*inited = !!(map.m_flags & EXT4_MAP_MAPPED);
>> +	return true;
>>  }
>>  
>>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
>> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>>   * - For extending writes case we don't take the shared lock, since it requires
>>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>>   *
>> - * - shared locking will only be true mostly with overwrites. Otherwise we will
>> - *   switch to exclusive i_rwsem lock.
>> + * - shared locking will only be true mostly with overwrites, include
>> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
>> + *   we protects splitting extents by i_data_sem in ext4_inode_info, so we can
>> + *   also release exclusive i_rwsem lock.
>> + *
>> + * - Otherwise we will switch to exclusive i_rwsem lock.
>>   */
>>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>> -				     bool *ilock_shared, bool *extend)
>> +				     bool *ilock_shared, bool *extend,
>> +				     bool *overwrite)
> 
> And it would be good to preserve the argument name in other functions as
> well - i.e., keep using 'unwritten' here as well.
> 
>>  {
>>  	struct file *file = iocb->ki_filp;
>>  	struct inode *inode = file_inode(file);
>> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>>  	 * in file_modified().
>>  	 */
>>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
>> -	     !ext4_overwrite_io(inode, offset, count))) {
>> +	     !ext4_overwrite_io(inode, offset, count, overwrite))) {
>>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>>  			ret = -EAGAIN;
>>  			goto out;
>> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  	loff_t offset = iocb->ki_pos;
>>  	size_t count = iov_iter_count(from);
>>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
>> -	bool extend = false, unaligned_io = false;
>> +	bool extend = false, unaligned_io = false, overwrite = false;
>>  	bool ilock_shared = true;
>>  
>>  	/*
>> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  		return ext4_buffered_write_iter(iocb, from);
>>  	}
>>  
>> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
>> +	ret = ext4_dio_write_checks(iocb, from,
>> +				    &ilock_shared, &extend, &overwrite);
>>  	if (ret <= 0)
>>  		return ret;
>>  
>> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  		ext4_journal_stop(handle);
>>  	}
>>  
>> -	if (ilock_shared)
>> +	if (overwrite)
>>  		iomap_ops = &ext4_iomap_overwrite_ops;
>>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
Zhang Yi Dec. 15, 2022, 8:41 a.m. UTC | #5
On 2022/12/15 2:52, Theodore Ts'o wrote:
> On Wed, Dec 14, 2022 at 06:01:25PM +0100, Jan Kara wrote:
>>
>> Besides some naming nits (see below) I think this should work. But I have
>> to say I'm a bit uneasy about this because we will now be changing block
>> mapping from unwritten to written only with shared i_rwsem. OTOH that
>> happens during writeback as well so we should be fine and the gain is very
>> nice.
> 
> Hmm.... when I was looking potential impacts of the change what
> ext4_overwrite_io() would do, I looked at the current user of that
> function in ext4_dio_write_checks().
> 
> 	/*
> 	 * Determine whether the IO operation will overwrite allocated
> 	 * and initialized blocks.
> 	 * We need exclusive i_rwsem for changing security info
> 	 * in file_modified().
> 	 */
> 	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> 	     !ext4_overwrite_io(inode, offset, count))) {
> 		if (iocb->ki_flags & IOCB_NOWAIT) {
> 			ret = -EAGAIN;
> 			goto out;
> 		}
> 		inode_unlock_shared(inode);
> 		*ilock_shared = false;
> 		inode_lock(inode);
> 		goto restart;
> 	}
> 
> 	ret = file_modified(file);
> 	if (ret < 0)
> 		goto out;
> 
> What is confusing me is the comment, "We need exclusive i_rwsem for
> changing security info in file_modified().".  But then we end up
> calling file_modified() unconditionally, regardless of whether we've
> transitioned from a shared lock to an exclusive lock.
> 
> So file_modified() can get called either with or without the inode
> locked r/w.  I realize that this patch doesn't change this
> inconsistency, but it appears either the comment is wrong, or the code
> is wrong.
> 
> What am I missing?
> 

IIUC, both of the comment and the code are correct, the __file_remove_privs()
in file_modified() should execute under exclusive lock, and we have already
check the IS_NOSEC(inode) and could make sure taking exclusive lock before we
remove privs. If we take share lock, __file_remove_privs() will return directly
because below check. So it's find now, but it looks that call file_update_time()
is enough for the shared lock case.

int file_update_time(struct file *file)
{
	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
		return 0;
...
}

Thanks,
Yi.
Jan Kara Dec. 15, 2022, 8:48 a.m. UTC | #6
On Wed 14-12-22 13:52:32, Theodore Ts'o wrote:
> On Wed, Dec 14, 2022 at 06:01:25PM +0100, Jan Kara wrote:
> > 
> > Besides some naming nits (see below) I think this should work. But I have
> > to say I'm a bit uneasy about this because we will now be changing block
> > mapping from unwritten to written only with shared i_rwsem. OTOH that
> > happens during writeback as well so we should be fine and the gain is very
> > nice.
> 
> Hmm.... when I was looking potential impacts of the change what
> ext4_overwrite_io() would do, I looked at the current user of that
> function in ext4_dio_write_checks().
> 
> 	/*
> 	 * Determine whether the IO operation will overwrite allocated
> 	 * and initialized blocks.
> 	 * We need exclusive i_rwsem for changing security info
> 	 * in file_modified().
> 	 */
> 	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> 	     !ext4_overwrite_io(inode, offset, count))) {
> 		if (iocb->ki_flags & IOCB_NOWAIT) {
> 			ret = -EAGAIN;
> 			goto out;
> 		}
> 		inode_unlock_shared(inode);
> 		*ilock_shared = false;
> 		inode_lock(inode);
> 		goto restart;
> 	}
> 
> 	ret = file_modified(file);
> 	if (ret < 0)
> 		goto out;
> 
> What is confusing me is the comment, "We need exclusive i_rwsem for
> changing security info in file_modified().".  But then we end up
> calling file_modified() unconditionally, regardless of whether we've
> transitioned from a shared lock to an exclusive lock.
> 
> So file_modified() can get called either with or without the inode
> locked r/w.  I realize that this patch doesn't change this
> inconsistency, but it appears either the comment is wrong, or the code
> is wrong.

Maybe the comment needs rephrasing but it seems correct. file_modified()
does multiple things. It updates timestamps - these are fine with shared
i_rwsem - and is calls into __file_remove_privs() to remove SUID bits etc.
Now if __file_remove_privs() is going to modify the inode, we need i_rwsem
exclusively. And we determine whether __file_remove_privs() will do
anything by checking !IS_NOSEC(inode) in the condition above. So the
sentence you're confused about speaks about this part of the condition.

								Honza
Zhang Yi Dec. 15, 2022, 8:49 a.m. UTC | #7
On 2022/12/15 16:41, Zhang Yi wrote:
> On 2022/12/15 2:52, Theodore Ts'o wrote:
>> On Wed, Dec 14, 2022 at 06:01:25PM +0100, Jan Kara wrote:
>>>
>>> Besides some naming nits (see below) I think this should work. But I have
>>> to say I'm a bit uneasy about this because we will now be changing block
>>> mapping from unwritten to written only with shared i_rwsem. OTOH that
>>> happens during writeback as well so we should be fine and the gain is very
>>> nice.
>>
>> Hmm.... when I was looking potential impacts of the change what
>> ext4_overwrite_io() would do, I looked at the current user of that
>> function in ext4_dio_write_checks().
>>
>> 	/*
>> 	 * Determine whether the IO operation will overwrite allocated
>> 	 * and initialized blocks.
>> 	 * We need exclusive i_rwsem for changing security info
>> 	 * in file_modified().
>> 	 */
>> 	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
>> 	     !ext4_overwrite_io(inode, offset, count))) {
>> 		if (iocb->ki_flags & IOCB_NOWAIT) {
>> 			ret = -EAGAIN;
>> 			goto out;
>> 		}
>> 		inode_unlock_shared(inode);
>> 		*ilock_shared = false;
>> 		inode_lock(inode);
>> 		goto restart;
>> 	}
>>
>> 	ret = file_modified(file);
>> 	if (ret < 0)
>> 		goto out;
>>
>> What is confusing me is the comment, "We need exclusive i_rwsem for
>> changing security info in file_modified().".  But then we end up
>> calling file_modified() unconditionally, regardless of whether we've
>> transitioned from a shared lock to an exclusive lock.
>>
>> So file_modified() can get called either with or without the inode
>> locked r/w.  I realize that this patch doesn't change this
>> inconsistency, but it appears either the comment is wrong, or the code
>> is wrong.
>>
>> What am I missing?
>>
> 
> IIUC, both of the comment and the code are correct, the __file_remove_privs()
> in file_modified() should execute under exclusive lock, and we have already
> check the IS_NOSEC(inode) and could make sure taking exclusive lock before we
> remove privs. If we take share lock, __file_remove_privs() will return directly
> because below check. So it's find now, but it looks that call file_update_time()
> is enough for the shared lock case.
> 
> int file_update_time(struct file *file)
static int __file_remove_privs(struct file *file, unsigned int flags)
> {
...
> 	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
> 		return 0;
> ...
> }
> 
> Thanks,
> Yi.
>
Jan Kara Dec. 15, 2022, 9 a.m. UTC | #8
On Thu 15-12-22 16:24:49, Zhang Yi wrote:
> On 2022/12/15 1:01, Jan Kara wrote:
> > On Sat 03-12-22 18:39:56, Zhang Yi wrote:
> >> In the dio write path, we only take shared inode lock for the case of
> >> aligned overwriting initialized blocks inside EOF. But for overwriting
> >> preallocated blocks, it may only need to split unwritten extents, this
> >> procedure has been protected under i_data_sem lock, it's safe to
> >> release the exclusive inode lock and take shared inode lock.
> >>
> >> This could give a significant speed up for multi-threaded writes. Test
> >> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
> >>
> >>  direct=1
> >>  ioengine=libaio
> >>  iodepth=10
> >>  numjobs=10
> >>  runtime=60
> >>  rw=randwrite
> >>  size=100G
> >>
> >> And the test result are:
> >> Before:
> >>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
> >>  bs=16k      IOPS=11.1k, BW=173MiB/s
> >>  bs=64k      IOPS=11.2k, BW=697MiB/s
> >>
> >> After:
> >>  bs=4k       IOPS=41.4k, BW=162MiB/s
> >>  bs=16k      IOPS=41.3k, BW=646MiB/s
> >>  bs=64k      IOPS=13.5k, BW=843MiB/s
> >>
> >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> >> ---
> >>  It passed xfstests auto mode with 1k and 4k blocksize.
> > 
> > Besides some naming nits (see below) I think this should work. But I have
> > to say I'm a bit uneasy about this because we will now be changing block
> > mapping from unwritten to written only with shared i_rwsem. OTOH that
> > happens during writeback as well so we should be fine and the gain is very
> > nice.
> > 
> Thanks for advice, I will change the argument name to make it more reasonable.
> 
> > Out of curiosity do you have a real usecase for this?
> 
> No, I was just analyse the performance gap in our benchmark tests, and have
> some question and idea while reading the code. Maybe it could speed up the
> first time write in some database. :)
> 
> Besides, I want to discuss it a bit more. I originally changed this code to
> switch to take the shared inode and also use ext4_iomap_overwrite_ops for
> the overwriting preallocated blocks case. It will postpone the splitting extent
> procedure to endio and could give a much more gain than this patch (+~27%).
> 
> This patch:
>   bs=4k       IOPS=41.4k, BW=162MiB/s
> Postpone split:
>   bs=4k       IOPS=52.9k, BW=207MiB/s
> 
> But I think it's maybe too radical. I looked at the history and notice in
> commit 0031462b5b39 ("ext4: Split uninitialized extents for direct I/O"),
> it introduce the flag EXT4_GET_BLOCKS_DIO(now it had been renamed to
> EXT4_GET_BLOCKS_PRE_IO) to make sure that the preallocated unwritten
> extent have been splitted before submitting the I/O, which is used to
> prevent the ENOSPC problem if the filesystem run out-of-space in the
> endio procedure. And 4 years later, commit 27dd43854227 ("ext4: introduce
> reserved space") reserve some blocks for metedata allocation.  It looks
> like this commit could also slove the ENOSPC problem for most cases if we
> postpone extent splitting into the endio procedure. I don't find other
> side effect, so I think it may also fine if we do that. Do you have some
> advice or am I missing something?

So you are right these days splitting of extents could be done only on IO
completion because we have a pool of blocks reserved for these cases. OTOH
this will make the pressure on the reserved pool higher and if we are
running out of space and there are enough operations running in parallel we
*could* run out of reserved blocks. So I wouldn't always defer extent
splitting to IO completion unless we have a practical and sufficiently
widespread usecase that would benefit from this optimization.

								Honza
Zhang Yi Dec. 15, 2022, 9:21 a.m. UTC | #9
On 2022/12/15 17:00, Jan Kara wrote:
> On Thu 15-12-22 16:24:49, Zhang Yi wrote:
>> On 2022/12/15 1:01, Jan Kara wrote:
>>> On Sat 03-12-22 18:39:56, Zhang Yi wrote:
>>>> In the dio write path, we only take shared inode lock for the case of
>>>> aligned overwriting initialized blocks inside EOF. But for overwriting
>>>> preallocated blocks, it may only need to split unwritten extents, this
>>>> procedure has been protected under i_data_sem lock, it's safe to
>>>> release the exclusive inode lock and take shared inode lock.
>>>>
>>>> This could give a significant speed up for multi-threaded writes. Test
>>>> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>>>>
>>>>  direct=1
>>>>  ioengine=libaio
>>>>  iodepth=10
>>>>  numjobs=10
>>>>  runtime=60
>>>>  rw=randwrite
>>>>  size=100G
>>>>
>>>> And the test result are:
>>>> Before:
>>>>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>>>>  bs=16k      IOPS=11.1k, BW=173MiB/s
>>>>  bs=64k      IOPS=11.2k, BW=697MiB/s
>>>>
>>>> After:
>>>>  bs=4k       IOPS=41.4k, BW=162MiB/s
>>>>  bs=16k      IOPS=41.3k, BW=646MiB/s
>>>>  bs=64k      IOPS=13.5k, BW=843MiB/s
>>>>
>>>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>>>> ---
>>>>  It passed xfstests auto mode with 1k and 4k blocksize.
>>>
>>> Besides some naming nits (see below) I think this should work. But I have
>>> to say I'm a bit uneasy about this because we will now be changing block
>>> mapping from unwritten to written only with shared i_rwsem. OTOH that
>>> happens during writeback as well so we should be fine and the gain is very
>>> nice.
>>>
>> Thanks for advice, I will change the argument name to make it more reasonable.
>>
>>> Out of curiosity do you have a real usecase for this?
>>
>> No, I was just analyse the performance gap in our benchmark tests, and have
>> some question and idea while reading the code. Maybe it could speed up the
>> first time write in some database. :)
>>
>> Besides, I want to discuss it a bit more. I originally changed this code to
>> switch to take the shared inode and also use ext4_iomap_overwrite_ops for
>> the overwriting preallocated blocks case. It will postpone the splitting extent
>> procedure to endio and could give a much more gain than this patch (+~27%).
>>
>> This patch:
>>   bs=4k       IOPS=41.4k, BW=162MiB/s
>> Postpone split:
>>   bs=4k       IOPS=52.9k, BW=207MiB/s
>>
>> But I think it's maybe too radical. I looked at the history and notice in
>> commit 0031462b5b39 ("ext4: Split uninitialized extents for direct I/O"),
>> it introduce the flag EXT4_GET_BLOCKS_DIO(now it had been renamed to
>> EXT4_GET_BLOCKS_PRE_IO) to make sure that the preallocated unwritten
>> extent have been splitted before submitting the I/O, which is used to
>> prevent the ENOSPC problem if the filesystem run out-of-space in the
>> endio procedure. And 4 years later, commit 27dd43854227 ("ext4: introduce
>> reserved space") reserve some blocks for metedata allocation.  It looks
>> like this commit could also slove the ENOSPC problem for most cases if we
>> postpone extent splitting into the endio procedure. I don't find other
>> side effect, so I think it may also fine if we do that. Do you have some
>> advice or am I missing something?
> 
> So you are right these days splitting of extents could be done only on IO
> completion because we have a pool of blocks reserved for these cases. OTOH
> this will make the pressure on the reserved pool higher and if we are
> running out of space and there are enough operations running in parallel we
> *could* run out of reserved blocks. So I wouldn't always defer extent
> splitting to IO completion unless we have a practical and sufficiently
> widespread usecase that would benefit from this optimization.
> 

Sure, I think so, thanks for advice.

Yi.
diff mbox series

Patch

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a7a597c727e6..7edac94025ac 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,8 +202,9 @@  ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
 	return false;
 }
 
-/* Is IO overwriting allocated and initialized blocks? */
-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+/* Is IO overwriting allocated or initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode,
+			      loff_t pos, loff_t len, bool *inited)
 {
 	struct ext4_map_blocks map;
 	unsigned int blkbits = inode->i_blkbits;
@@ -217,12 +218,15 @@  static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
 	blklen = map.m_len;
 
 	err = ext4_map_blocks(NULL, inode, &map, 0);
+	if (err != blklen)
+		return false;
 	/*
 	 * 'err==len' means that all of the blocks have been preallocated,
-	 * regardless of whether they have been initialized or not. To exclude
-	 * unwritten extents, we need to check m_flags.
+	 * regardless of whether they have been initialized or not. We need to
+	 * check m_flags to distinguish the unwritten extents.
 	 */
-	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+	*inited = !!(map.m_flags & EXT4_MAP_MAPPED);
+	return true;
 }
 
 static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
@@ -431,11 +435,16 @@  static const struct iomap_dio_ops ext4_dio_write_ops = {
  * - For extending writes case we don't take the shared lock, since it requires
  *   updating inode i_disksize and/or orphan handling with exclusive lock.
  *
- * - shared locking will only be true mostly with overwrites. Otherwise we will
- *   switch to exclusive i_rwsem lock.
+ * - shared locking will only be true mostly with overwrites, include
+ *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
+ *   we protects splitting extents by i_data_sem in ext4_inode_info, so we can
+ *   also release exclusive i_rwsem lock.
+ *
+ * - Otherwise we will switch to exclusive i_rwsem lock.
  */
 static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
-				     bool *ilock_shared, bool *extend)
+				     bool *ilock_shared, bool *extend,
+				     bool *overwrite)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -459,7 +468,7 @@  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
 	 * in file_modified().
 	 */
 	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
-	     !ext4_overwrite_io(inode, offset, count))) {
+	     !ext4_overwrite_io(inode, offset, count, overwrite))) {
 		if (iocb->ki_flags & IOCB_NOWAIT) {
 			ret = -EAGAIN;
 			goto out;
@@ -491,7 +500,7 @@  static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	loff_t offset = iocb->ki_pos;
 	size_t count = iov_iter_count(from);
 	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
-	bool extend = false, unaligned_io = false;
+	bool extend = false, unaligned_io = false, overwrite = false;
 	bool ilock_shared = true;
 
 	/*
@@ -534,7 +543,8 @@  static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return ext4_buffered_write_iter(iocb, from);
 	}
 
-	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+	ret = ext4_dio_write_checks(iocb, from,
+				    &ilock_shared, &extend, &overwrite);
 	if (ret <= 0)
 		return ret;
 
@@ -582,7 +592,7 @@  static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ext4_journal_stop(handle);
 	}
 
-	if (ilock_shared)
+	if (overwrite)
 		iomap_ops = &ext4_iomap_overwrite_ops;
 	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
 			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,