diff mbox series

[1/2] block: zero data data corruption using prealloc-filter

Message ID 20240711133242.251061-2-andrey.drobyshev@virtuozzo.com
State New
Headers show
Series Fix data corruption within preallocation | expand

Commit Message

Andrey Drobyshev July 11, 2024, 1:32 p.m. UTC
From: "Denis V. Lunev" <den@openvz.org>

We have observed that some clusters in the QCOW2 files are zeroed
while preallocation filter is used.

We are able to trace down the following sequence when prealloc-filter
is used:
    co=0x55e7cbed7680 qcow2_co_pwritev_task()
    co=0x55e7cbed7680 preallocate_co_pwritev_part()
    co=0x55e7cbed7680 handle_write()
    co=0x55e7cbed7680 bdrv_co_do_pwrite_zeroes()
    co=0x55e7cbed7680 raw_do_pwrite_zeroes()
    co=0x7f9edb7fe500 do_fallocate()

Here coroutine 0x55e7cbed7680 is being blocked waiting while coroutine
0x7f9edb7fe500 will finish with fallocate of the file area. OK. It is
time to handle next coroutine, which
    co=0x55e7cbee91b0 qcow2_co_pwritev_task()
    co=0x55e7cbee91b0 preallocate_co_pwritev_part()
    co=0x55e7cbee91b0 handle_write()
    co=0x55e7cbee91b0 bdrv_co_do_pwrite_zeroes()
    co=0x55e7cbee91b0 raw_do_pwrite_zeroes()
    co=0x7f9edb7deb00 do_fallocate()

The trouble comes here. Coroutine 0x55e7cbed7680 has not advanced
file_end yet and coroutine 0x55e7cbee91b0 will start fallocate() for
the same area. This means that if (once fallocate is started inside
0x7f9edb7deb00) original fallocate could end and the real write will
be executed. In that case write() request is handled at the same time
as fallocate().

Normally we should protect s->file_end while it is detected that
preallocation is need. The patch introduces file_end_lock for it to be
protected when run in the coroutine context.

Note: the lock is taken only once it is detected that the preallocation
is really required. This is not a frequent case due to the preallocation
nature thus the patch should not have performance impact.

Originally-by: Denis V. Lunev <den@openvz.org>
Co-authored-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 block/preallocate.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

Comments

Vladimir Sementsov-Ogievskiy July 12, 2024, 12:22 p.m. UTC | #1
On 11.07.24 16:32, Andrey Drobyshev wrote:
> From: "Denis V. Lunev" <den@openvz.org>
> 
> We have observed that some clusters in the QCOW2 files are zeroed
> while preallocation filter is used.
> 
> We are able to trace down the following sequence when prealloc-filter
> is used:
>      co=0x55e7cbed7680 qcow2_co_pwritev_task()
>      co=0x55e7cbed7680 preallocate_co_pwritev_part()
>      co=0x55e7cbed7680 handle_write()
>      co=0x55e7cbed7680 bdrv_co_do_pwrite_zeroes()
>      co=0x55e7cbed7680 raw_do_pwrite_zeroes()
>      co=0x7f9edb7fe500 do_fallocate()
> 
> Here coroutine 0x55e7cbed7680 is being blocked waiting while coroutine
> 0x7f9edb7fe500 will finish with fallocate of the file area. OK. It is
> time to handle next coroutine, which
>      co=0x55e7cbee91b0 qcow2_co_pwritev_task()
>      co=0x55e7cbee91b0 preallocate_co_pwritev_part()
>      co=0x55e7cbee91b0 handle_write()
>      co=0x55e7cbee91b0 bdrv_co_do_pwrite_zeroes()
>      co=0x55e7cbee91b0 raw_do_pwrite_zeroes()
>      co=0x7f9edb7deb00 do_fallocate()
> 
> The trouble comes here. Coroutine 0x55e7cbed7680 has not advanced
> file_end yet and coroutine 0x55e7cbee91b0 will start fallocate() for
> the same area. This means that if (once fallocate is started inside
> 0x7f9edb7deb00) original fallocate could end and the real write will
> be executed. In that case write() request is handled at the same time
> as fallocate().
> 
> Normally we should protect s->file_end while it is detected that
> preallocation is need. The patch introduces file_end_lock for it to be
> protected when run in the coroutine context.
> 
> Note: the lock is taken only once it is detected that the preallocation
> is really required. This is not a frequent case due to the preallocation
> nature thus the patch should not have performance impact.
> 
> Originally-by: Denis V. Lunev <den@openvz.org>
> Co-authored-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>   block/preallocate.c | 11 +++++++++++
>   1 file changed, 11 insertions(+)
> 
> diff --git a/block/preallocate.c b/block/preallocate.c
> index d215bc5d6d..9cb2c97635 100644
> --- a/block/preallocate.c
> +++ b/block/preallocate.c
> @@ -78,6 +78,8 @@ typedef struct BDRVPreallocateState {
>   
>       /* Gives up the resize permission on children when parents don't need it */
>       QEMUBH *drop_resize_bh;
> +
> +    CoMutex file_end_lock;
>   } BDRVPreallocateState;
>   
>   static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
> @@ -170,6 +172,8 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
>           ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
>               bs->file->bs->supported_zero_flags);
>   
> +    qemu_co_mutex_init(&s->file_end_lock);
> +
>       return 0;
>   }
>   
> @@ -342,6 +346,7 @@ handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
>               return false;
>           }
>   
> +        QEMU_LOCK_GUARD(&s->file_end_lock);
>           if (s->file_end < 0) {
>               s->file_end = s->data_end;
>           }
> @@ -353,6 +358,8 @@ handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
>   
>       /* We have valid s->data_end, and request writes beyond it. */
>   
> +    QEMU_LOCK_GUARD(&s->file_end_lock);
> +
>       s->data_end = end;
>       if (s->zero_start < 0 || !want_merge_zero) {
>           s->zero_start = end;
> @@ -428,6 +435,8 @@ preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
>       BDRVPreallocateState *s = bs->opaque;
>       int ret;
>   
> +    QEMU_LOCK_GUARD(&s->file_end_lock);
> +
>       if (s->data_end >= 0 && offset > s->data_end) {
>           if (s->file_end < 0) {
>               s->file_end = bdrv_co_getlength(bs->file->bs);
> @@ -501,6 +510,8 @@ preallocate_co_getlength(BlockDriverState *bs)
>           return s->data_end;
>       }
>   
> +    QEMU_LOCK_GUARD(&s->file_end_lock);
> +
>       ret = bdrv_co_getlength(bs->file->bs);
>   
>       if (has_prealloc_perms(bs)) {


Hmm, seems preallocate driver not thread-safe neither coroutine-safe. I think co-mutex is good idea. Still, protecting only s->file_end may be not enough

- we do want to keep data_end / zero_start / file_end variables correct
- probably, just make the whole preallocation code a critical section? Maybe, atomic read of variables for fast-path (for writes which doesn't need preallocation)
diff mbox series

Patch

diff --git a/block/preallocate.c b/block/preallocate.c
index d215bc5d6d..9cb2c97635 100644
--- a/block/preallocate.c
+++ b/block/preallocate.c
@@ -78,6 +78,8 @@  typedef struct BDRVPreallocateState {
 
     /* Gives up the resize permission on children when parents don't need it */
     QEMUBH *drop_resize_bh;
+
+    CoMutex file_end_lock;
 } BDRVPreallocateState;
 
 static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
@@ -170,6 +172,8 @@  static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
         ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
             bs->file->bs->supported_zero_flags);
 
+    qemu_co_mutex_init(&s->file_end_lock);
+
     return 0;
 }
 
@@ -342,6 +346,7 @@  handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
             return false;
         }
 
+        QEMU_LOCK_GUARD(&s->file_end_lock);
         if (s->file_end < 0) {
             s->file_end = s->data_end;
         }
@@ -353,6 +358,8 @@  handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
 
     /* We have valid s->data_end, and request writes beyond it. */
 
+    QEMU_LOCK_GUARD(&s->file_end_lock);
+
     s->data_end = end;
     if (s->zero_start < 0 || !want_merge_zero) {
         s->zero_start = end;
@@ -428,6 +435,8 @@  preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
     BDRVPreallocateState *s = bs->opaque;
     int ret;
 
+    QEMU_LOCK_GUARD(&s->file_end_lock);
+
     if (s->data_end >= 0 && offset > s->data_end) {
         if (s->file_end < 0) {
             s->file_end = bdrv_co_getlength(bs->file->bs);
@@ -501,6 +510,8 @@  preallocate_co_getlength(BlockDriverState *bs)
         return s->data_end;
     }
 
+    QEMU_LOCK_GUARD(&s->file_end_lock);
+
     ret = bdrv_co_getlength(bs->file->bs);
 
     if (has_prealloc_perms(bs)) {