diff mbox

[7/7] block/raw-posix: set max_write_zeroes to INT_MAX for regular files

Message ID 54CF85BE.6030302@kamp.de
State New
Headers show

Commit Message

Peter Lieven Feb. 2, 2015, 2:12 p.m. UTC
Am 02.02.2015 um 15:04 schrieb Kevin Wolf:
> Am 02.02.2015 um 14:55 hat Peter Lieven geschrieben:
>> Am 02.02.2015 um 14:23 schrieb Kevin Wolf:
>>> Am 30.01.2015 um 09:42 hat Denis V. Lunev geschrieben:
>>>> fallocate() works fine and could handle properly with arbitrary size
>>>> requests. There is no sense to reduce the amount of space to fallocate.
>>>> The bigger is the size, the better is the performance as the amount of
>>>> journal updates is reduced.
>>>>
>>>> The patch changes behavior for both generic filesystem and XFS codepaths,
>>>> which are different in handle_aiocb_write_zeroes. The implementation
>>>> of fallocate and xfsctl(XFS_IOC_ZERO_RANGE) for XFS are exactly the same
>>>> thus the change is fine for both ways.
>>>>
>>>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>> CC: Kevin Wolf <kwolf@redhat.com>
>>>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>>>> CC: Peter Lieven <pl@kamp.de>
>>>> CC: Fam Zheng <famz@redhat.com>
>>>> ---
>>>>   block/raw-posix.c | 17 +++++++++++++++++
>>>>   1 file changed, 17 insertions(+)
>>>>
>>>> diff --git a/block/raw-posix.c b/block/raw-posix.c
>>>> index 7b42f37..933c778 100644
>>>> --- a/block/raw-posix.c
>>>> +++ b/block/raw-posix.c
>>>> @@ -293,6 +293,20 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>>>>       }
>>>>   }
>>>> +static void raw_probe_max_write_zeroes(BlockDriverState *bs)
>>>> +{
>>>> +    BDRVRawState *s = bs->opaque;
>>>> +    struct stat st;
>>>> +
>>>> +    if (fstat(s->fd, &st) < 0) {
>>>> +        return; /* no problem, keep default value */
>>>> +    }
>>>> +    if (!S_ISREG(st.st_mode) || !s->discard_zeroes) {
>>>> +        return;
>>>> +    }
>>>> +    bs->bl.max_write_zeroes = INT_MAX;
>>>> +}
>>> Peter, do you remember why INT_MAX isn't actually the default? I think
>>> the most reasonable behaviour would be that a limitation is only used if
>>> a block driver requests it, and otherwise unlimited is assumed.
>> The default (0) actually means unlimited or undefined. We introduced
>> that limit of 16MB in bdrv_co_write_zeroes to create only reasonable
>> sized requests because there is no guarantee that write zeroes is a
>> fast operation. We should set INT_MAX only if we know that write
>> zeroes of an arbitrary size is always fast.
> Well, splitting it up doesn't make it any faster. I think we can assume
> that drv->bdrv_co_write_zeroes() wants to know the full request size
> unless the driver has explicitly set bs->bl.max_write_zeroes.

You mean something like this:




Peter

Comments

Kevin Wolf Feb. 2, 2015, 2:16 p.m. UTC | #1
Am 02.02.2015 um 15:12 hat Peter Lieven geschrieben:
> Am 02.02.2015 um 15:04 schrieb Kevin Wolf:
> >Am 02.02.2015 um 14:55 hat Peter Lieven geschrieben:
> >>Am 02.02.2015 um 14:23 schrieb Kevin Wolf:
> >>>Am 30.01.2015 um 09:42 hat Denis V. Lunev geschrieben:
> >>>>fallocate() works fine and could handle properly with arbitrary size
> >>>>requests. There is no sense to reduce the amount of space to fallocate.
> >>>>The bigger is the size, the better is the performance as the amount of
> >>>>journal updates is reduced.
> >>>>
> >>>>The patch changes behavior for both generic filesystem and XFS codepaths,
> >>>>which are different in handle_aiocb_write_zeroes. The implementation
> >>>>of fallocate and xfsctl(XFS_IOC_ZERO_RANGE) for XFS are exactly the same
> >>>>thus the change is fine for both ways.
> >>>>
> >>>>Signed-off-by: Denis V. Lunev <den@openvz.org>
> >>>>Reviewed-by: Max Reitz <mreitz@redhat.com>
> >>>>CC: Kevin Wolf <kwolf@redhat.com>
> >>>>CC: Stefan Hajnoczi <stefanha@redhat.com>
> >>>>CC: Peter Lieven <pl@kamp.de>
> >>>>CC: Fam Zheng <famz@redhat.com>
> >>>>---
> >>>>  block/raw-posix.c | 17 +++++++++++++++++
> >>>>  1 file changed, 17 insertions(+)
> >>>>
> >>>>diff --git a/block/raw-posix.c b/block/raw-posix.c
> >>>>index 7b42f37..933c778 100644
> >>>>--- a/block/raw-posix.c
> >>>>+++ b/block/raw-posix.c
> >>>>@@ -293,6 +293,20 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
> >>>>      }
> >>>>  }
> >>>>+static void raw_probe_max_write_zeroes(BlockDriverState *bs)
> >>>>+{
> >>>>+    BDRVRawState *s = bs->opaque;
> >>>>+    struct stat st;
> >>>>+
> >>>>+    if (fstat(s->fd, &st) < 0) {
> >>>>+        return; /* no problem, keep default value */
> >>>>+    }
> >>>>+    if (!S_ISREG(st.st_mode) || !s->discard_zeroes) {
> >>>>+        return;
> >>>>+    }
> >>>>+    bs->bl.max_write_zeroes = INT_MAX;
> >>>>+}
> >>>Peter, do you remember why INT_MAX isn't actually the default? I think
> >>>the most reasonable behaviour would be that a limitation is only used if
> >>>a block driver requests it, and otherwise unlimited is assumed.
> >>The default (0) actually means unlimited or undefined. We introduced
> >>that limit of 16MB in bdrv_co_write_zeroes to create only reasonable
> >>sized requests because there is no guarantee that write zeroes is a
> >>fast operation. We should set INT_MAX only if we know that write
> >>zeroes of an arbitrary size is always fast.
> >Well, splitting it up doesn't make it any faster. I think we can assume
> >that drv->bdrv_co_write_zeroes() wants to know the full request size
> >unless the driver has explicitly set bs->bl.max_write_zeroes.
> 
> You mean sth like this:

Yes, I think that's what I meant.

Kevin

> diff --git a/block.c b/block.c
> index 61412e9..8272ef9 100644
> --- a/block.c
> +++ b/block.c
> @@ -3192,10 +3192,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
>                              BDRV_REQ_COPY_ON_READ);
>  }
> 
> -/* if no limit is specified in the BlockLimits use a default
> - * of 32768 512-byte sectors (16 MiB) per request.
> - */
> -#define MAX_WRITE_ZEROES_DEFAULT 32768
> +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
> 
>  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
> @@ -3206,7 +3203,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>      int ret = 0;
> 
>      int max_write_zeroes = bs->bl.max_write_zeroes ?
> -                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
> +                           bs->bl.max_write_zeroes : INT_MAX;
> 
>      while (nb_sectors > 0 && !ret) {
>          int num = nb_sectors;
> @@ -3242,7 +3239,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>          if (ret == -ENOTSUP) {
>              /* Fall back to bounce buffer if write zeroes is unsupported */
>              int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
> - MAX_WRITE_ZEROES_DEFAULT);
> + MAX_WRITE_ZEROES_BOUNCE_BUFFER);
>              num = MIN(num, max_xfer_len);
>              iov.iov_len = num * BDRV_SECTOR_SIZE;
>              if (iov.iov_base == NULL) {
> @@ -5099,11 +5096,6 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
>      rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
>  }
> 
> -/* if no limit is specified in the BlockLimits use a default
> - * of 32768 512-byte sectors (16 MiB) per request.
> - */
> -#define MAX_DISCARD_DEFAULT 32768
> -
>  int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>                                   int nb_sectors)
>  {
> @@ -5128,7 +5120,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>          return 0;
>      }
> 
> -    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
> +    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : INT_MAX;
>      while (nb_sectors > 0) {
>          int ret;
>          int num = nb_sectors;
> 
> 
> 
> Peter
Peter Lieven Feb. 2, 2015, 2:20 p.m. UTC | #2
Am 02.02.2015 um 15:16 schrieb Kevin Wolf:
> Am 02.02.2015 um 15:12 hat Peter Lieven geschrieben:
>> Am 02.02.2015 um 15:04 schrieb Kevin Wolf:
>>> Am 02.02.2015 um 14:55 hat Peter Lieven geschrieben:
>>>> Am 02.02.2015 um 14:23 schrieb Kevin Wolf:
>>>>> Am 30.01.2015 um 09:42 hat Denis V. Lunev geschrieben:
>>>>>> fallocate() works fine and could handle properly with arbitrary size
>>>>>> requests. There is no sense to reduce the amount of space to fallocate.
>>>>>> The bigger is the size, the better is the performance as the amount of
>>>>>> journal updates is reduced.
>>>>>>
>>>>>> The patch changes behavior for both generic filesystem and XFS codepaths,
>>>>>> which are different in handle_aiocb_write_zeroes. The implementation
>>>>>> of fallocate and xfsctl(XFS_IOC_ZERO_RANGE) for XFS are exactly the same
>>>>>> thus the change is fine for both ways.
>>>>>>
>>>>>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>>>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>>>> CC: Kevin Wolf <kwolf@redhat.com>
>>>>>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>>>>>> CC: Peter Lieven <pl@kamp.de>
>>>>>> CC: Fam Zheng <famz@redhat.com>
>>>>>> ---
>>>>>>   block/raw-posix.c | 17 +++++++++++++++++
>>>>>>   1 file changed, 17 insertions(+)
>>>>>>
>>>>>> diff --git a/block/raw-posix.c b/block/raw-posix.c
>>>>>> index 7b42f37..933c778 100644
>>>>>> --- a/block/raw-posix.c
>>>>>> +++ b/block/raw-posix.c
>>>>>> @@ -293,6 +293,20 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>>>>>>       }
>>>>>>   }
>>>>>> +static void raw_probe_max_write_zeroes(BlockDriverState *bs)
>>>>>> +{
>>>>>> +    BDRVRawState *s = bs->opaque;
>>>>>> +    struct stat st;
>>>>>> +
>>>>>> +    if (fstat(s->fd, &st) < 0) {
>>>>>> +        return; /* no problem, keep default value */
>>>>>> +    }
>>>>>> +    if (!S_ISREG(st.st_mode) || !s->discard_zeroes) {
>>>>>> +        return;
>>>>>> +    }
>>>>>> +    bs->bl.max_write_zeroes = INT_MAX;
>>>>>> +}
>>>>> Peter, do you remember why INT_MAX isn't actually the default? I think
>>>>> the most reasonable behaviour would be that a limitation is only used if
>>>>> a block driver requests it, and otherwise unlimited is assumed.
>>>> The default (0) actually means unlimited or undefined. We introduced
>>>> that limit of 16MB in bdrv_co_write_zeroes to create only reasonable
>>>> sized requests because there is no guarantee that write zeroes is a
>>>> fast operation. We should set INT_MAX only if we know that write
>>>> zeroes of an arbitrary size is always fast.
>>> Well, splitting it up doesn't make it any faster. I think we can assume
>>> that drv->bdrv_co_write_zeroes() wants to know the full request size
>>> unless the driver has explicitly set bs->bl.max_write_zeroes.
>> You mean sth like this:
> Yes, I think that's what I meant.

I can't find the original discussion about why we added this limit. It was actually the default
before we introduced BlockLimits. It was also the default in the unsupported path
of write zeroes, which created big memory allocations. This might be the reason why
we introduced a limit.

Peter

>
> Kevin
>
>> diff --git a/block.c b/block.c
>> index 61412e9..8272ef9 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -3192,10 +3192,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
>>                               BDRV_REQ_COPY_ON_READ);
>>   }
>>
>> -/* if no limit is specified in the BlockLimits use a default
>> - * of 32768 512-byte sectors (16 MiB) per request.
>> - */
>> -#define MAX_WRITE_ZEROES_DEFAULT 32768
>> +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
>>
>>   static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>       int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
>> @@ -3206,7 +3203,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>       int ret = 0;
>>
>>       int max_write_zeroes = bs->bl.max_write_zeroes ?
>> -                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
>> +                           bs->bl.max_write_zeroes : INT_MAX;
>>
>>       while (nb_sectors > 0 && !ret) {
>>           int num = nb_sectors;
>> @@ -3242,7 +3239,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>           if (ret == -ENOTSUP) {
>>               /* Fall back to bounce buffer if write zeroes is unsupported */
>>               int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
>> - MAX_WRITE_ZEROES_DEFAULT);
>> + MAX_WRITE_ZEROES_BOUNCE_BUFFER);
>>               num = MIN(num, max_xfer_len);
>>               iov.iov_len = num * BDRV_SECTOR_SIZE;
>>               if (iov.iov_base == NULL) {
>> @@ -5099,11 +5096,6 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
>>       rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
>>   }
>>
>> -/* if no limit is specified in the BlockLimits use a default
>> - * of 32768 512-byte sectors (16 MiB) per request.
>> - */
>> -#define MAX_DISCARD_DEFAULT 32768
>> -
>>   int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>>                                    int nb_sectors)
>>   {
>> @@ -5128,7 +5120,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>>           return 0;
>>       }
>>
>> -    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
>> +    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : INT_MAX;
>>       while (nb_sectors > 0) {
>>           int ret;
>>           int num = nb_sectors;
>>
>>
>>
>> Peter
Denis V. Lunev Feb. 2, 2015, 2:38 p.m. UTC | #3
On 02/02/15 17:20, Peter Lieven wrote:
> Am 02.02.2015 um 15:16 schrieb Kevin Wolf:
>> Am 02.02.2015 um 15:12 hat Peter Lieven geschrieben:
>>> Am 02.02.2015 um 15:04 schrieb Kevin Wolf:
>>>> Am 02.02.2015 um 14:55 hat Peter Lieven geschrieben:
>>>>> Am 02.02.2015 um 14:23 schrieb Kevin Wolf:
>>>>>> Am 30.01.2015 um 09:42 hat Denis V. Lunev geschrieben:
>>>>>>> fallocate() works fine and could handle properly with arbitrary 
>>>>>>> size
>>>>>>> requests. There is no sense to reduce the amount of space to 
>>>>>>> fallocate.
>>>>>>> The bigger is the size, the better is the performance as the 
>>>>>>> amount of
>>>>>>> journal updates is reduced.
>>>>>>>
>>>>>>> The patch changes behavior for both generic filesystem and XFS 
>>>>>>> codepaths,
>>>>>>> which are different in handle_aiocb_write_zeroes. The 
>>>>>>> implementation
>>>>>>> of fallocate and xfsctl(XFS_IOC_ZERO_RANGE) for XFS are exactly 
>>>>>>> the same
>>>>>>> thus the change is fine for both ways.
>>>>>>>
>>>>>>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>>>>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>>>>> CC: Kevin Wolf <kwolf@redhat.com>
>>>>>>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>>>>>>> CC: Peter Lieven <pl@kamp.de>
>>>>>>> CC: Fam Zheng <famz@redhat.com>
>>>>>>> ---
>>>>>>>   block/raw-posix.c | 17 +++++++++++++++++
>>>>>>>   1 file changed, 17 insertions(+)
>>>>>>>
>>>>>>> diff --git a/block/raw-posix.c b/block/raw-posix.c
>>>>>>> index 7b42f37..933c778 100644
>>>>>>> --- a/block/raw-posix.c
>>>>>>> +++ b/block/raw-posix.c
>>>>>>> @@ -293,6 +293,20 @@ static void 
>>>>>>> raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>>>>>>>       }
>>>>>>>   }
>>>>>>> +static void raw_probe_max_write_zeroes(BlockDriverState *bs)
>>>>>>> +{
>>>>>>> +    BDRVRawState *s = bs->opaque;
>>>>>>> +    struct stat st;
>>>>>>> +
>>>>>>> +    if (fstat(s->fd, &st) < 0) {
>>>>>>> +        return; /* no problem, keep default value */
>>>>>>> +    }
>>>>>>> +    if (!S_ISREG(st.st_mode) || !s->discard_zeroes) {
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +    bs->bl.max_write_zeroes = INT_MAX;
>>>>>>> +}
>>>>>> Peter, do you remember why INT_MAX isn't actually the default? I 
>>>>>> think
>>>>>> the most reasonable behaviour would be that a limitation is only 
>>>>>> used if
>>>>>> a block driver requests it, and otherwise unlimited is assumed.
>>>>> The default (0) actually means unlimited or undefined. We introduced
>>>>> that limit of 16MB in bdrv_co_write_zeroes to create only reasonable
>>>>> sized requests because there is no guarantee that write zeroes is a
>>>>> fast operation. We should set INT_MAX only if we know that write
>>>>> zeroes of an arbitrary size is always fast.
>>>> Well, splitting it up doesn't make it any faster. I think we can 
>>>> assume
>>>> that drv->bdrv_co_write_zeroes() wants to know the full request size
>>>> unless the driver has explicitly set bs->bl.max_write_zeroes.
>>> You mean sth like this:
>> Yes, I think that's what I meant.
>
> I can't find the original discussion why we added this limit. It was 
> actually the default
> before we introduced BlockLimits. And, it was also the default in the 
> unsupported path
> of write zeroes which created big memory allocations. This might be 
> the reason why
> we introduced a limit.
>
> Peter
>
My $0.02 here is that even if the patch below introduces a regression
(though after some checking I cannot imagine how at the moment),
we should fix the bogus driver.

Personally I do not like such unnatural limitations.

Den

>>
>> Kevin
>>
>>> diff --git a/block.c b/block.c
>>> index 61412e9..8272ef9 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -3192,10 +3192,7 @@ int coroutine_fn 
>>> bdrv_co_copy_on_readv(BlockDriverState *bs,
>>>                               BDRV_REQ_COPY_ON_READ);
>>>   }
>>>
>>> -/* if no limit is specified in the BlockLimits use a default
>>> - * of 32768 512-byte sectors (16 MiB) per request.
>>> - */
>>> -#define MAX_WRITE_ZEROES_DEFAULT 32768
>>> +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
>>>
>>>   static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>>       int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
>>> @@ -3206,7 +3203,7 @@ static int coroutine_fn 
>>> bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>>       int ret = 0;
>>>
>>>       int max_write_zeroes = bs->bl.max_write_zeroes ?
>>> -                           bs->bl.max_write_zeroes : 
>>> MAX_WRITE_ZEROES_DEFAULT;
>>> +                           bs->bl.max_write_zeroes : INT_MAX;
>>>
>>>       while (nb_sectors > 0 && !ret) {
>>>           int num = nb_sectors;
>>> @@ -3242,7 +3239,7 @@ static int coroutine_fn 
>>> bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>>           if (ret == -ENOTSUP) {
>>>               /* Fall back to bounce buffer if write zeroes is 
>>> unsupported */
>>>               int max_xfer_len = 
>>> MIN_NON_ZERO(bs->bl.max_transfer_length,
>>> - MAX_WRITE_ZEROES_DEFAULT);
>>> + MAX_WRITE_ZEROES_BOUNCE_BUFFER);
>>>               num = MIN(num, max_xfer_len);
>>>               iov.iov_len = num * BDRV_SECTOR_SIZE;
>>>               if (iov.iov_base == NULL) {
>>> @@ -5099,11 +5096,6 @@ static void coroutine_fn 
>>> bdrv_discard_co_entry(void *opaque)
>>>       rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, 
>>> rwco->nb_sectors);
>>>   }
>>>
>>> -/* if no limit is specified in the BlockLimits use a default
>>> - * of 32768 512-byte sectors (16 MiB) per request.
>>> - */
>>> -#define MAX_DISCARD_DEFAULT 32768
>>> -
>>>   int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t 
>>> sector_num,
>>>                                    int nb_sectors)
>>>   {
>>> @@ -5128,7 +5120,7 @@ int coroutine_fn 
>>> bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
>>>           return 0;
>>>       }
>>>
>>> -    max_discard = bs->bl.max_discard ? bs->bl.max_discard : 
>>> MAX_DISCARD_DEFAULT;
>>> +    max_discard = bs->bl.max_discard ? bs->bl.max_discard : INT_MAX;
>>>       while (nb_sectors > 0) {
>>>           int ret;
>>>           int num = nb_sectors;
>>>
>>>
>>>
>>> Peter
>
>
Kevin Wolf Feb. 2, 2015, 2:49 p.m. UTC | #4
Am 02.02.2015 um 15:20 hat Peter Lieven geschrieben:
> Am 02.02.2015 um 15:16 schrieb Kevin Wolf:
> >Am 02.02.2015 um 15:12 hat Peter Lieven geschrieben:
> >>Am 02.02.2015 um 15:04 schrieb Kevin Wolf:
> >>>Am 02.02.2015 um 14:55 hat Peter Lieven geschrieben:
> >>>>Am 02.02.2015 um 14:23 schrieb Kevin Wolf:
> >>>>>Am 30.01.2015 um 09:42 hat Denis V. Lunev geschrieben:
> >>>>>>fallocate() works fine and could handle properly with arbitrary size
> >>>>>>requests. There is no sense to reduce the amount of space to fallocate.
> >>>>>>The bigger is the size, the better is the performance as the amount of
> >>>>>>journal updates is reduced.
> >>>>>>
> >>>>>>The patch changes behavior for both generic filesystem and XFS codepaths,
> >>>>>>which are different in handle_aiocb_write_zeroes. The implementation
> >>>>>>of fallocate and xfsctl(XFS_IOC_ZERO_RANGE) for XFS are exactly the same
> >>>>>>thus the change is fine for both ways.
> >>>>>>
> >>>>>>Signed-off-by: Denis V. Lunev <den@openvz.org>
> >>>>>>Reviewed-by: Max Reitz <mreitz@redhat.com>
> >>>>>>CC: Kevin Wolf <kwolf@redhat.com>
> >>>>>>CC: Stefan Hajnoczi <stefanha@redhat.com>
> >>>>>>CC: Peter Lieven <pl@kamp.de>
> >>>>>>CC: Fam Zheng <famz@redhat.com>
> >>>>>>---
> >>>>>>  block/raw-posix.c | 17 +++++++++++++++++
> >>>>>>  1 file changed, 17 insertions(+)
> >>>>>>
> >>>>>>diff --git a/block/raw-posix.c b/block/raw-posix.c
> >>>>>>index 7b42f37..933c778 100644
> >>>>>>--- a/block/raw-posix.c
> >>>>>>+++ b/block/raw-posix.c
> >>>>>>@@ -293,6 +293,20 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
> >>>>>>      }
> >>>>>>  }
> >>>>>>+static void raw_probe_max_write_zeroes(BlockDriverState *bs)
> >>>>>>+{
> >>>>>>+    BDRVRawState *s = bs->opaque;
> >>>>>>+    struct stat st;
> >>>>>>+
> >>>>>>+    if (fstat(s->fd, &st) < 0) {
> >>>>>>+        return; /* no problem, keep default value */
> >>>>>>+    }
> >>>>>>+    if (!S_ISREG(st.st_mode) || !s->discard_zeroes) {
> >>>>>>+        return;
> >>>>>>+    }
> >>>>>>+    bs->bl.max_write_zeroes = INT_MAX;
> >>>>>>+}
> >>>>>Peter, do you remember why INT_MAX isn't actually the default? I think
> >>>>>the most reasonable behaviour would be that a limitation is only used if
> >>>>>a block driver requests it, and otherwise unlimited is assumed.
> >>>>The default (0) actually means unlimited or undefined. We introduced
> >>>>that limit of 16MB in bdrv_co_write_zeroes to create only reasonable
> >>>>sized requests because there is no guarantee that write zeroes is a
> >>>>fast operation. We should set INT_MAX only if we know that write
> >>>>zeroes of an arbitrary size is always fast.
> >>>Well, splitting it up doesn't make it any faster. I think we can assume
> >>>that drv->bdrv_co_write_zeroes() wants to know the full request size
> >>>unless the driver has explicitly set bs->bl.max_write_zeroes.
> >>You mean sth like this:
> >Yes, I think that's what I meant.
> 
> I can't find the original discussion why we added this limit. It was actually the default
> before we introduced BlockLimits. And, it was also the default in the unsupported path
> of write zeroes which created big memory allocations. This might be the reason why
> we introduced a limit.

Commit c31cb707 added the limit to bdrv_co_do_write_zeroes(). Before, we
used a bounce buffer of unbounded size.

Anyway, it seems that none of us can think of a reason not to apply the
patch to block.c. Let's just do it, and if it does break something,
we'll figure it out. Can you send it as a proper patch?

Denis, if we apply that patch, would you be okay with dropping 7/7 from
this series, or would something still be missing?

Kevin

> >>diff --git a/block.c b/block.c
> >>index 61412e9..8272ef9 100644
> >>--- a/block.c
> >>+++ b/block.c
> >>@@ -3192,10 +3192,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
> >>                              BDRV_REQ_COPY_ON_READ);
> >>  }
> >>
> >>-/* if no limit is specified in the BlockLimits use a default
> >>- * of 32768 512-byte sectors (16 MiB) per request.
> >>- */
> >>-#define MAX_WRITE_ZEROES_DEFAULT 32768
> >>+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
> >>
> >>  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
> >>      int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
> >>@@ -3206,7 +3203,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
> >>      int ret = 0;
> >>
> >>      int max_write_zeroes = bs->bl.max_write_zeroes ?
> >>-                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
> >>+                           bs->bl.max_write_zeroes : INT_MAX;
> >>
> >>      while (nb_sectors > 0 && !ret) {
> >>          int num = nb_sectors;
> >>@@ -3242,7 +3239,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
> >>          if (ret == -ENOTSUP) {
> >>              /* Fall back to bounce buffer if write zeroes is unsupported */
> >>              int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
> >>- MAX_WRITE_ZEROES_DEFAULT);
> >>+ MAX_WRITE_ZEROES_BOUNCE_BUFFER);
> >>              num = MIN(num, max_xfer_len);
> >>              iov.iov_len = num * BDRV_SECTOR_SIZE;
> >>              if (iov.iov_base == NULL) {
> >>@@ -5099,11 +5096,6 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
> >>      rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
> >>  }
> >>
> >>-/* if no limit is specified in the BlockLimits use a default
> >>- * of 32768 512-byte sectors (16 MiB) per request.
> >>- */
> >>-#define MAX_DISCARD_DEFAULT 32768
> >>-
> >>  int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
> >>                                   int nb_sectors)
> >>  {
> >>@@ -5128,7 +5120,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
> >>          return 0;
> >>      }
> >>
> >>-    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
> >>+    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : INT_MAX;
> >>      while (nb_sectors > 0) {
> >>          int ret;
> >>          int num = nb_sectors;
> >>
> >>
> >>
> >>Peter
> 
>
Denis V. Lunev Feb. 2, 2015, 3:30 p.m. UTC | #5
On 02/02/15 17:49, Kevin Wolf wrote:
> Am 02.02.2015 um 15:20 hat Peter Lieven geschrieben:
>> Am 02.02.2015 um 15:16 schrieb Kevin Wolf:
>>> Am 02.02.2015 um 15:12 hat Peter Lieven geschrieben:
>>>> Am 02.02.2015 um 15:04 schrieb Kevin Wolf:
>>>>> Am 02.02.2015 um 14:55 hat Peter Lieven geschrieben:
>>>>>> Am 02.02.2015 um 14:23 schrieb Kevin Wolf:
>>>>>>> Am 30.01.2015 um 09:42 hat Denis V. Lunev geschrieben:
>>>>>>>> fallocate() works fine and could handle properly with arbitrary size
>>>>>>>> requests. There is no sense to reduce the amount of space to fallocate.
>>>>>>>> The bigger is the size, the better is the performance as the amount of
>>>>>>>> journal updates is reduced.
>>>>>>>>
>>>>>>>> The patch changes behavior for both generic filesystem and XFS codepaths,
>>>>>>>> which are different in handle_aiocb_write_zeroes. The implementation
>>>>>>>> of fallocate and xfsctl(XFS_IOC_ZERO_RANGE) for XFS are exactly the same
>>>>>>>> thus the change is fine for both ways.
>>>>>>>>
>>>>>>>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>>>>>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>>>>>> CC: Kevin Wolf <kwolf@redhat.com>
>>>>>>>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>>>>>>>> CC: Peter Lieven <pl@kamp.de>
>>>>>>>> CC: Fam Zheng <famz@redhat.com>
>>>>>>>> ---
>>>>>>>>   block/raw-posix.c | 17 +++++++++++++++++
>>>>>>>>   1 file changed, 17 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/block/raw-posix.c b/block/raw-posix.c
>>>>>>>> index 7b42f37..933c778 100644
>>>>>>>> --- a/block/raw-posix.c
>>>>>>>> +++ b/block/raw-posix.c
>>>>>>>> @@ -293,6 +293,20 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>>>>>>>>       }
>>>>>>>>   }
>>>>>>>> +static void raw_probe_max_write_zeroes(BlockDriverState *bs)
>>>>>>>> +{
>>>>>>>> +    BDRVRawState *s = bs->opaque;
>>>>>>>> +    struct stat st;
>>>>>>>> +
>>>>>>>> +    if (fstat(s->fd, &st) < 0) {
>>>>>>>> +        return; /* no problem, keep default value */
>>>>>>>> +    }
>>>>>>>> +    if (!S_ISREG(st.st_mode) || !s->discard_zeroes) {
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +    bs->bl.max_write_zeroes = INT_MAX;
>>>>>>>> +}
>>>>>>> Peter, do you remember why INT_MAX isn't actually the default? I think
>>>>>>> the most reasonable behaviour would be that a limitation is only used if
>>>>>>> a block driver requests it, and otherwise unlimited is assumed.
>>>>>> The default (0) actually means unlimited or undefined. We introduced
>>>>>> that limit of 16MB in bdrv_co_write_zeroes to create only reasonable
>>>>>> sized requests because there is no guarantee that write zeroes is a
>>>>>> fast operation. We should set INT_MAX only if we know that write
>>>>>> zeroes of an arbitrary size is always fast.
>>>>> Well, splitting it up doesn't make it any faster. I think we can assume
>>>>> that drv->bdrv_co_write_zeroes() wants to know the full request size
>>>>> unless the driver has explicitly set bs->bl.max_write_zeroes.
>>>> You mean sth like this:
>>> Yes, I think that's what I meant.
>> I can't find the original discussion why we added this limit. It was actually the default
>> before we introduced BlockLimits. And, it was also the default in the unsupported path
>> of write zeroes which created big memory allocations. This might be the reason why
>> we introduced a limit.
> Commit c31cb707 added the limit to bdrv_co_do_write_zeroes(). Before, we
> used a bounce buffer of unbounded size.
>
> Anyway, it seems that none of us can think of a reason not to apply the
> patch to block.c. Let's just do it, and if it does break something,
> we'll figure it out. Can you send it as a proper patch?
>
> Denis, if we apply that patch, would you be okay with dropping 7/7 from
> this series, or would still something be missing?
>
> Kevin
Sure. This will be even better. Something similar was implemented in
v1/v2 of the patchset.

Regards,
     Den
diff mbox

Patch

diff --git a/block.c b/block.c
index 61412e9..8272ef9 100644
--- a/block.c
+++ b/block.c
@@ -3192,10 +3192,7 @@  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                              BDRV_REQ_COPY_ON_READ);
  }

-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_WRITE_ZEROES_DEFAULT 32768
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
      int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
@@ -3206,7 +3203,7 @@  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
      int ret = 0;

      int max_write_zeroes = bs->bl.max_write_zeroes ?
-                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
+                           bs->bl.max_write_zeroes : INT_MAX;

      while (nb_sectors > 0 && !ret) {
          int num = nb_sectors;
@@ -3242,7 +3239,7 @@  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
          if (ret == -ENOTSUP) {
              /* Fall back to bounce buffer if write zeroes is unsupported */
              int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
- MAX_WRITE_ZEROES_DEFAULT);
+ MAX_WRITE_ZEROES_BOUNCE_BUFFER);
              num = MIN(num, max_xfer_len);
              iov.iov_len = num * BDRV_SECTOR_SIZE;
              if (iov.iov_base == NULL) {
@@ -5099,11 +5096,6 @@  static void coroutine_fn bdrv_discard_co_entry(void *opaque)
      rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
  }

-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_DISCARD_DEFAULT 32768
-
  int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors)
  {
@@ -5128,7 +5120,7 @@  int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
          return 0;
      }

-    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
+    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : INT_MAX;
      while (nb_sectors > 0) {
          int ret;
          int num = nb_sectors;