diff mbox series

[v3,2/2] qcow2: Skip copy-on-write when allocating a zero cluster

Message ID 0350dd55702eacd701e0aea924b101db7448a517.1599759873.git.berto@igalia.com
State New
Headers show
Series Skip copy-on-write when allocating a zero cluster | expand

Commit Message

Alberto Garcia Sept. 10, 2020, 5:46 p.m. UTC
Since commit c8bb23cbdbe32f5c326365e0a82e1b0e68cdcd8a when a write
request results in a new allocation QEMU first tries to see if the
rest of the cluster outside the written area contains only zeroes.

In that case, instead of doing a normal copy-on-write operation and
writing explicit zero buffers to disk, the code zeroes the whole
cluster efficiently using pwrite_zeroes() with BDRV_REQ_NO_FALLBACK.

This improves performance very significantly but it only happens when
we are writing to an area that was completely unallocated before. Zero
clusters (QCOW2_CLUSTER_ZERO_*) are treated like normal clusters and
are therefore slower to allocate.

This happens because the code uses bdrv_is_allocated_above() rather
than bdrv_block_status_above(). The former is not as accurate for this
purpose but it is faster. However in the case of qcow2 the underlying
call does already report zero clusters just fine so there is no reason
why we cannot use that information.

After testing 4KB writes on an image that only contains zero clusters
this patch results in almost five times more IOPS.

Signed-off-by: Alberto Garcia <berto@igalia.com>
---
 include/block/block.h |  2 ++
 block/io.c            | 27 +++++++++++++++++++++++++++
 block/qcow2.c         | 35 +++++++++++++++++++----------------
 3 files changed, 48 insertions(+), 16 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy Sept. 11, 2020, 9:34 a.m. UTC | #1
10.09.2020 20:46, Alberto Garcia wrote:
> Since commit c8bb23cbdbe32f5c326365e0a82e1b0e68cdcd8a when a write
> request results in a new allocation QEMU first tries to see if the
> rest of the cluster outside the written area contains only zeroes.
> 
> In that case, instead of doing a normal copy-on-write operation and
> writing explicit zero buffers to disk, the code zeroes the whole
> cluster efficiently using pwrite_zeroes() with BDRV_REQ_NO_FALLBACK.
> 
> This improves performance very significantly but it only happens when
> we are writing to an area that was completely unallocated before. Zero
> clusters (QCOW2_CLUSTER_ZERO_*) are treated like normal clusters and
> are therefore slower to allocate.
> 
> This happens because the code uses bdrv_is_allocated_above() rather
> than bdrv_block_status_above(). The former is not as accurate for this
> purpose but it is faster. However in the case of qcow2 the underlying
> call does already report zero clusters just fine so there is no reason
> why we cannot use that information.
> 
> After testing 4KB writes on an image that only contains zero clusters
> this patch results in almost five times more IOPS.
> 
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> ---
>   include/block/block.h |  2 ++
>   block/io.c            | 27 +++++++++++++++++++++++++++
>   block/qcow2.c         | 35 +++++++++++++++++++----------------
>   3 files changed, 48 insertions(+), 16 deletions(-)
> 
> diff --git a/include/block/block.h b/include/block/block.h
> index 6e36154061..71f5678de7 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -496,6 +496,8 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
>   int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
>                               bool include_base, int64_t offset, int64_t bytes,
>                               int64_t *pnum);
> +int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
> +                                      int64_t bytes);
>   
>   bool bdrv_is_read_only(BlockDriverState *bs);
>   int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
> diff --git a/block/io.c b/block/io.c
> index 1b0ae29610..5950ad87be 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -2557,6 +2557,33 @@ int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                                      offset, bytes, pnum, map, file);
>   }
>   
> +/*
> + * Check @bs (and its backing chain) to see if the range defined
> + * by @offset and @bytes is known to read as zeroes.
> + * Return 1 if that is the case, 0 otherwise and -errno on error.
> + * This test is meant to be fast rather than accurate so returning 0
> + * does not guarantee non-zero data.
> + */
> +int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
> +                                      int64_t bytes)
> +{
> +    int ret;
> +    int64_t pnum = bytes;
> +
> +    if (!bytes) {
> +        return 1;
> +    }
> +
> +    ret = bdrv_common_block_status_above(bs, NULL, false, offset,
> +                                         bytes, &pnum, NULL, NULL);
> +
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
> +}
> +
>   int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
>                                      int64_t bytes, int64_t *pnum)
>   {
> diff --git a/block/qcow2.c b/block/qcow2.c
> index da56b1a4df..68ab6562e3 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2391,26 +2391,26 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>       return false;
>   }
>   
> -static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
> -{
> -    int64_t nr;
> -    return !bytes ||
> -        (!bdrv_is_allocated_above(bs, NULL, false, offset, bytes, &nr) &&
> -         nr == bytes);
> -}
> -
> -static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
> +/*
> + * Return 1 if the COW regions read as zeroes, 0 if not, < 0 on error.
> + * Note that returning 0 does not guarantee non-zero data.
> + */
> +static int is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
>   {
>       /*
>        * This check is designed for optimization shortcut so it must be
>        * efficient.
> -     * Instead of is_zero(), use is_unallocated() as it is faster (but not
> -     * as accurate and can result in false negatives).
> +     * Instead of is_zero(), use bdrv_co_is_zero_fast() as it is
> +     * faster (but not as accurate and can result in false negatives).
>        */
> -    return is_unallocated(bs, m->offset + m->cow_start.offset,
> -                          m->cow_start.nb_bytes) &&
> -           is_unallocated(bs, m->offset + m->cow_end.offset,
> -                          m->cow_end.nb_bytes);
> +    int ret = bdrv_co_is_zero_fast(bs, m->offset + m->cow_start.offset,
> +                                   m->cow_start.nb_bytes);
> +    if (ret <= 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_is_zero_fast(bs, m->offset + m->cow_end.offset,
> +                                m->cow_end.nb_bytes);
>   }
>   
>   static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
> @@ -2436,7 +2436,10 @@ static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>               continue;
>           }
>   
> -        if (!is_zero_cow(bs, m)) {
> +        ret = is_zero_cow(bs, m);
> +        if (ret < 0) {
> +            return ret;

It's a common practice to treat block-status errors as "unknown" status and not error-out immediately:

  - really, it's not critical, we can continue assuming non-zero
  - if there are real problems with IO, we'll most probably fail on real read or write operation, and report its status, which seems better for user than block-status error

So, I'd keep existing logic in handle_alloc_space(). And, if you agree and resend, probably good to split this patch into two, one for block.h/io.c and one for qcow2.c (still, I'm OK with it as one patch).

> +        } else if (ret == 0) {
>               continue;
>           }
>   
>
Alberto Garcia Sept. 11, 2020, 10:04 a.m. UTC | #2
On Fri 11 Sep 2020 11:34:37 AM CEST, Vladimir Sementsov-Ogievskiy wrote:
>> -        if (!is_zero_cow(bs, m)) {
>> +        ret = is_zero_cow(bs, m);
>> +        if (ret < 0) {
>> +            return ret;
>
> It's a common practice to treat block-status errors as "unknown"
> status and not error-out immediately:
>
>   - really, it's not critical, we can continue assuming non-zero
>   - if there are real problems with IO, we'll most probably fail on
>   real read or write operation, and report its status, which seems
>   better for user than block-status error

But what's the problem exactly, does this complicate the code too much?
:-?

> So, I'd keep existing logic in handle_alloc_space(). And, if you agree
> and resend, probably good to split this patch into two, one for
> block.h/io.c and one for qcow2.c (still, I'm OK with it as one patch).

Sure, I can split the patch if I have to resend it.

Berto
Vladimir Sementsov-Ogievskiy Sept. 11, 2020, 11:06 a.m. UTC | #3
11.09.2020 13:04, Alberto Garcia wrote:
> On Fri 11 Sep 2020 11:34:37 AM CEST, Vladimir Sementsov-Ogievskiy wrote:
>>> -        if (!is_zero_cow(bs, m)) {
>>> +        ret = is_zero_cow(bs, m);
>>> +        if (ret < 0) {
>>> +            return ret;
>>
>> It's a common practice to treat block-status errors as "unknown"
>> status and not error-out immediately:
>>
>>    - really, it's not critical, we can continue assuming non-zero
>>    - if there are real problems with IO, we'll most probably fail on
>>    real read or write operation, and report its status, which seems
>>    better for user than block-status error
> 
> But what's the problem exactly, does this complicate the code too much?
> :-?

Of course not :) Hmm. OK, I don't know, I'm just used to this practice in block jobs. Patch is correct as is:
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

> 
>> So, I'd keep existing logic in handle_alloc_space(). And, if you agree
>> and resend, probably good to split this patch into two, one for
>> block.h/io.c and one for qcow2.c (still, I'm OK with it as one patch).
> 
> Sure, I can split the patch if I have to resend it.
> 
> Berto
>
diff mbox series

Patch

diff --git a/include/block/block.h b/include/block/block.h
index 6e36154061..71f5678de7 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -496,6 +496,8 @@  int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
 int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
                             bool include_base, int64_t offset, int64_t bytes,
                             int64_t *pnum);
+int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
+                                      int64_t bytes);
 
 bool bdrv_is_read_only(BlockDriverState *bs);
 int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
diff --git a/block/io.c b/block/io.c
index 1b0ae29610..5950ad87be 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2557,6 +2557,33 @@  int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                                    offset, bytes, pnum, map, file);
 }
 
+/*
+ * Check @bs (and its backing chain) to see if the range defined
+ * by @offset and @bytes is known to read as zeroes.
+ * Return 1 if that is the case, 0 otherwise and -errno on error.
+ * This test is meant to be fast rather than accurate so returning 0
+ * does not guarantee non-zero data.
+ */
+int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
+                                      int64_t bytes)
+{
+    int ret;
+    int64_t pnum = bytes;
+
+    if (!bytes) {
+        return 1;
+    }
+
+    ret = bdrv_common_block_status_above(bs, NULL, false, offset,
+                                         bytes, &pnum, NULL, NULL);
+
+    if (ret < 0) {
+        return ret;
+    }
+
+    return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
+}
+
 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                    int64_t bytes, int64_t *pnum)
 {
diff --git a/block/qcow2.c b/block/qcow2.c
index da56b1a4df..68ab6562e3 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2391,26 +2391,26 @@  static bool merge_cow(uint64_t offset, unsigned bytes,
     return false;
 }
 
-static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
-{
-    int64_t nr;
-    return !bytes ||
-        (!bdrv_is_allocated_above(bs, NULL, false, offset, bytes, &nr) &&
-         nr == bytes);
-}
-
-static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
+/*
+ * Return 1 if the COW regions read as zeroes, 0 if not, < 0 on error.
+ * Note that returning 0 does not guarantee non-zero data.
+ */
+static int is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
 {
     /*
      * This check is designed for optimization shortcut so it must be
      * efficient.
-     * Instead of is_zero(), use is_unallocated() as it is faster (but not
-     * as accurate and can result in false negatives).
+     * Instead of is_zero(), use bdrv_co_is_zero_fast() as it is
+     * faster (but not as accurate and can result in false negatives).
      */
-    return is_unallocated(bs, m->offset + m->cow_start.offset,
-                          m->cow_start.nb_bytes) &&
-           is_unallocated(bs, m->offset + m->cow_end.offset,
-                          m->cow_end.nb_bytes);
+    int ret = bdrv_co_is_zero_fast(bs, m->offset + m->cow_start.offset,
+                                   m->cow_start.nb_bytes);
+    if (ret <= 0) {
+        return ret;
+    }
+
+    return bdrv_co_is_zero_fast(bs, m->offset + m->cow_end.offset,
+                                m->cow_end.nb_bytes);
 }
 
 static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
@@ -2436,7 +2436,10 @@  static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
             continue;
         }
 
-        if (!is_zero_cow(bs, m)) {
+        ret = is_zero_cow(bs, m);
+        if (ret < 0) {
+            return ret;
+        } else if (ret == 0) {
             continue;
         }