diff mbox series

[4/7] qcow2: make subclusters discardable

Message ID 20231020215622.789260-5-andrey.drobyshev@virtuozzo.com
State New
Headers show
Series qcow2: make subclusters discardable | expand

Commit Message

Andrey Drobyshev Oct. 20, 2023, 9:56 p.m. UTC
This commit makes the discard operation work on the subcluster level
rather than cluster level.  It introduces discard_l2_subclusters()
function and makes use of it in qcow2 discard implementation, much like
it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
operation and free host disk space.

This feature will let us gain additional disk space on guest
TRIM/discard requests, especially when using large enough clusters
(1M, 2M) with subclusters enabled.

Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
---
 block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++--
 block/qcow2.c         |   8 ++--
 2 files changed, 101 insertions(+), 7 deletions(-)

Comments

Jean-Louis Dupond Oct. 27, 2023, 11:10 a.m. UTC | #1
On 20/10/2023 23:56, Andrey Drobyshev wrote:
> This commit makes the discard operation work on the subcluster level
> rather than cluster level.  It introduces discard_l2_subclusters()
> function and makes use of it in qcow2 discard implementation, much like
> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
> operation and free host disk space.
>
> This feature will let us gain additional disk space on guest
> TRIM/discard requests, especially when using large enough clusters
> (1M, 2M) with subclusters enabled.
>
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>   block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++--
>   block/qcow2.c         |   8 ++--
>   2 files changed, 101 insertions(+), 7 deletions(-)
>
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 7c6fa5524c..cf40f2dc12 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
>       return nb_clusters;
>   }
>   
> +static int coroutine_fn GRAPH_RDLOCK
> +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
> +                       uint64_t nb_subclusters,
> +                       enum qcow2_discard_type type,
> +                       bool full_discard,
> +                       SubClusterRangeInfo *pscri)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t new_l2_bitmap, l2_bitmap_mask;
> +    int ret, sc = offset_to_sc_index(s, offset);
> +    SubClusterRangeInfo scri = { 0 };
> +
> +    if (!pscri) {
> +        ret = get_sc_range_info(bs, offset, nb_subclusters, &scri);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +    } else {
> +        scri = *pscri;
> +    }
> +
> +    l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
> +    new_l2_bitmap = scri.l2_bitmap;
> +    new_l2_bitmap &= ~l2_bitmap_mask;
> +
> +    /*
> +     * If there're no allocated subclusters left, we might as well discard
> +     * the entire cluster.  That way we'd also update the refcount table.
> +     */
> +    if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) {
> +        return discard_in_l2_slice(bs,
> +                                   QEMU_ALIGN_DOWN(offset, s->cluster_size),
> +                                   1, type, full_discard);
> +    }
> +
> +    /*
> +     * Full discard means we fall through to the backing file, thus we only
> +     * need to mark the subclusters as deallocated.
> +     *
> +     * Non-full discard means subclusters should be explicitly marked as
> +     * zeroes.  In this case QCOW2 specification requires the corresponding
> +     * allocation status bits to be unset as well.  If the subclusters are
> +     * deallocated in the first place and there's no backing, the operation
> +     * can be skipped.
> +     */
> +    if (!full_discard &&
> +        (bs->backing || scri.l2_bitmap & l2_bitmap_mask)) {
> +        new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
> +    }
> +
> +    if (scri.l2_bitmap != new_l2_bitmap) {
> +        set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
> +        qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
> +    }
> +
> +    if (s->discard_passthrough[type]) {
> +        qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) +
> +                            offset_into_cluster(s, offset),
> +                            nb_subclusters * s->subcluster_size);
> +    }
> +
> +    ret = 0;
> +out:
> +    qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice);
> +
> +    return ret;
> +}
> +
>   int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
>                             uint64_t bytes, enum qcow2_discard_type type,
>                             bool full_discard)
> @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
>       BDRVQcow2State *s = bs->opaque;
>       uint64_t end_offset = offset + bytes;
>       uint64_t nb_clusters;
> +    unsigned head, tail;
>       int64_t cleared;
>       int ret;
>   
>       /* Caller must pass aligned values, except at image end */
> -    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
> -    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
> +    assert(QEMU_IS_ALIGNED(offset, s->subcluster_size));
> +    assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) ||
>              end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
>   
> -    nb_clusters = size_to_clusters(s, bytes);
> +    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
> +    offset += head;
> +
> +    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
> +           end_offset - MAX(offset, start_of_cluster(s, end_offset));
> +    end_offset -= tail;
>   
>       s->cache_discards = true;
>   
> +    if (head) {
> +        ret = discard_l2_subclusters(bs, offset - head,
> +                                     size_to_subclusters(s, head), type,
> +                                     full_discard, NULL);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
>       /* Each L2 slice is handled by its own loop iteration */
> +    nb_clusters = size_to_clusters(s, end_offset - offset);
> +
>       while (nb_clusters > 0) {
>           cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
>                                         full_discard);
> @@ -2074,6 +2159,15 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
>           offset += (cleared * s->cluster_size);
>       }
>   
> +    if (tail) {
> +        ret = discard_l2_subclusters(bs, end_offset,
> +                                     size_to_subclusters(s, tail), type,
> +                                     full_discard, NULL);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
>       ret = 0;
>   fail:
>       s->cache_discards = false;
> diff --git a/block/qcow2.c b/block/qcow2.c
> index aa01d9e7b5..66961fa59e 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -1966,7 +1966,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
>           bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto);
>       }
>       bs->bl.pwrite_zeroes_alignment = s->subcluster_size;
> -    bs->bl.pdiscard_alignment = s->cluster_size;
> +    bs->bl.pdiscard_alignment = s->subcluster_size;
>   }
>   
>   static int GRAPH_UNLOCKED
> @@ -4102,11 +4102,11 @@ qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
>           return -ENOTSUP;
>       }
>   
> -    if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
> -        assert(bytes < s->cluster_size);
> +    if (!QEMU_IS_ALIGNED(offset | bytes, bs->bl.pdiscard_alignment)) {
> +        assert(bytes < bs->bl.pdiscard_alignment);
>           /* Ignore partial clusters, except for the special case of the
>            * complete partial cluster at the end of an unaligned file */
> -        if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
> +        if (!QEMU_IS_ALIGNED(offset, bs->bl.pdiscard_alignment) ||
>               offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
>               return -ENOTSUP;
>           }


I've checked all the code paths, and as far as I see it nowhere breaks 
the discard_no_unref option.
It's important that we don't introduce new code paths that can make 
holes in the qcow2 image when this option is enabled :)

If you can confirm my conclusion, that would be great.


Thanks
Jean-Louis
Hanna Czenczek Oct. 31, 2023, 4:32 p.m. UTC | #2
On 20.10.23 23:56, Andrey Drobyshev wrote:
> This commit makes the discard operation work on the subcluster level
> rather than cluster level.  It introduces discard_l2_subclusters()
> function and makes use of it in qcow2 discard implementation, much like
> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
> operation and free host disk space.
>
> This feature will let us gain additional disk space on guest
> TRIM/discard requests, especially when using large enough clusters
> (1M, 2M) with subclusters enabled.
>
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>   block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++--
>   block/qcow2.c         |   8 ++--
>   2 files changed, 101 insertions(+), 7 deletions(-)
>
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 7c6fa5524c..cf40f2dc12 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c

[...]

> +    if (scri.l2_bitmap != new_l2_bitmap) {
> +        set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
> +        qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
> +    }
> +
> +    if (s->discard_passthrough[type]) {
> +        qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) +
> +                            offset_into_cluster(s, offset),
> +                            nb_subclusters * s->subcluster_size);

Are we sure that the cluster is allocated, i.e. that scri.l2_entry & 
L2E_OFFSET_MASK != 0?

As a side note, I guess discard_in_l2_slice() should also use 
qcow2_queue_discard() instead of bdrv_pdiscard() then.

> +    }
> +
> +    ret = 0;
> +out:
> +    qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice);
> +
> +    return ret;
> +}
> +
>   int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
>                             uint64_t bytes, enum qcow2_discard_type type,
>                             bool full_discard)
> @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
>       BDRVQcow2State *s = bs->opaque;
>       uint64_t end_offset = offset + bytes;
>       uint64_t nb_clusters;
> +    unsigned head, tail;
>       int64_t cleared;
>       int ret;
>   
>       /* Caller must pass aligned values, except at image end */
> -    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
> -    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
> +    assert(QEMU_IS_ALIGNED(offset, s->subcluster_size));
> +    assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) ||
>              end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
>   
> -    nb_clusters = size_to_clusters(s, bytes);
> +    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
> +    offset += head;
> +
> +    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
> +           end_offset - MAX(offset, start_of_cluster(s, end_offset));
> +    end_offset -= tail;
>   
>       s->cache_discards = true;
>   
> +    if (head) {
> +        ret = discard_l2_subclusters(bs, offset - head,
> +                                     size_to_subclusters(s, head), type,
> +                                     full_discard, NULL);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
>       /* Each L2 slice is handled by its own loop iteration */
> +    nb_clusters = size_to_clusters(s, end_offset - offset);
> +
>       while (nb_clusters > 0) {

I think the comment should stay attached to the `while`.

Hanna

>           cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
>                                         full_discard);
Hanna Czenczek Oct. 31, 2023, 4:33 p.m. UTC | #3
(Sorry, opened another reply window, forgot I already had one open...)

On 20.10.23 23:56, Andrey Drobyshev wrote:
> This commit makes the discard operation work on the subcluster level
> rather than cluster level.  It introduces discard_l2_subclusters()
> function and makes use of it in qcow2 discard implementation, much like
> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
> operation and free host disk space.
>
> This feature will let us gain additional disk space on guest
> TRIM/discard requests, especially when using large enough clusters
> (1M, 2M) with subclusters enabled.
>
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>   block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++--
>   block/qcow2.c         |   8 ++--
>   2 files changed, 101 insertions(+), 7 deletions(-)
>
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 7c6fa5524c..cf40f2dc12 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
>       return nb_clusters;
>   }
>   
> +static int coroutine_fn GRAPH_RDLOCK
> +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
> +                       uint64_t nb_subclusters,
> +                       enum qcow2_discard_type type,
> +                       bool full_discard,
> +                       SubClusterRangeInfo *pscri)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t new_l2_bitmap, l2_bitmap_mask;
> +    int ret, sc = offset_to_sc_index(s, offset);
> +    SubClusterRangeInfo scri = { 0 };
> +
> +    if (!pscri) {
> +        ret = get_sc_range_info(bs, offset, nb_subclusters, &scri);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +    } else {
> +        scri = *pscri;

Allowing to takes this from the caller sounds dangerous, considering we 
need to track who takes care of freeing scri.l2_slice.

> +    }
> +
> +    l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
> +    new_l2_bitmap = scri.l2_bitmap;
> +    new_l2_bitmap &= ~l2_bitmap_mask;
> +
> +    /*
> +     * If there're no allocated subclusters left, we might as well discard
> +     * the entire cluster.  That way we'd also update the refcount table.
> +     */
> +    if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) {

What if there are subclusters in the cluster that are marked as zero, 
outside of the discarded range?  It sounds wrong to apply a discard with 
either full_discard set or cleared to them.

> +        return discard_in_l2_slice(bs,
> +                                   QEMU_ALIGN_DOWN(offset, s->cluster_size),
> +                                   1, type, full_discard);
> +    }
> +
> +    /*
> +     * Full discard means we fall through to the backing file, thus we only
> +     * need to mark the subclusters as deallocated.

I think it also means we need to clear the zero bits.

Hanna

> +     *
> +     * Non-full discard means subclusters should be explicitly marked as
> +     * zeroes.  In this case QCOW2 specification requires the corresponding
> +     * allocation status bits to be unset as well.  If the subclusters are
> +     * deallocated in the first place and there's no backing, the operation
> +     * can be skipped.
> +     */
> +    if (!full_discard &&
> +        (bs->backing || scri.l2_bitmap & l2_bitmap_mask)) {
> +        new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
> +    }
Hanna Czenczek Nov. 3, 2023, 3:53 p.m. UTC | #4
On 20.10.23 23:56, Andrey Drobyshev wrote:
> This commit makes the discard operation work on the subcluster level
> rather than cluster level.  It introduces discard_l2_subclusters()
> function and makes use of it in qcow2 discard implementation, much like
> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
> operation and free host disk space.
>
> This feature will let us gain additional disk space on guest
> TRIM/discard requests, especially when using large enough clusters
> (1M, 2M) with subclusters enabled.
>
> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
> ---
>   block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++--
>   block/qcow2.c         |   8 ++--
>   2 files changed, 101 insertions(+), 7 deletions(-)
>
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 7c6fa5524c..cf40f2dc12 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
>       return nb_clusters;
>   }
>   
> +static int coroutine_fn GRAPH_RDLOCK
> +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
> +                       uint64_t nb_subclusters,
> +                       enum qcow2_discard_type type,
> +                       bool full_discard,
> +                       SubClusterRangeInfo *pscri)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t new_l2_bitmap, l2_bitmap_mask;
> +    int ret, sc = offset_to_sc_index(s, offset);
> +    SubClusterRangeInfo scri = { 0 };
> +
> +    if (!pscri) {
> +        ret = get_sc_range_info(bs, offset, nb_subclusters, &scri);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +    } else {
> +        scri = *pscri;
> +    }
> +
> +    l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
> +    new_l2_bitmap = scri.l2_bitmap;
> +    new_l2_bitmap &= ~l2_bitmap_mask;
> +
> +    /*
> +     * If there're no allocated subclusters left, we might as well discard
> +     * the entire cluster.  That way we'd also update the refcount table.
> +     */
> +    if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) {
> +        return discard_in_l2_slice(bs,
> +                                   QEMU_ALIGN_DOWN(offset, s->cluster_size),
> +                                   1, type, full_discard);
> +    }

scri.l2_slice is leaked here.

Hanna
Andrey Drobyshev Nov. 9, 2023, 3:05 p.m. UTC | #5
On 10/31/23 18:32, Hanna Czenczek wrote:
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> This commit makes the discard operation work on the subcluster level
>> rather than cluster level.  It introduces discard_l2_subclusters()
>> function and makes use of it in qcow2 discard implementation, much like
>> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
>> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
>> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
>> operation and free host disk space.
>>
>> This feature will let us gain additional disk space on guest
>> TRIM/discard requests, especially when using large enough clusters
>> (1M, 2M) with subclusters enabled.
>>
>> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
>> ---
>>   block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++--
>>   block/qcow2.c         |   8 ++--
>>   2 files changed, 101 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index 7c6fa5524c..cf40f2dc12 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
> 
> [...]
> 
>> +    if (scri.l2_bitmap != new_l2_bitmap) {
>> +        set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
>> +        qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
>> +    }
>> +
>> +    if (s->discard_passthrough[type]) {
>> +        qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) +
>> +                            offset_into_cluster(s, offset),
>> +                            nb_subclusters * s->subcluster_size);
> 
> Are we sure that the cluster is allocated, i.e. that scri.l2_entry &
> L2E_OFFSET_MASK != 0?
> 

I think it must be illegal to mark the entire cluster as unallocated and
yet mark some of the subclusters as allocated.  So in the case you
describe we should detect it earlier in the '!(new_l2_bitmap &
QCOW_L2_BITMAP_ALL_ALLOC)' condition and fall back to discard_in_l2_slice().

> As a side note, I guess discard_in_l2_slice() should also use
> qcow2_queue_discard() instead of bdrv_pdiscard() then.
> 

Yes, looks like it.  I'll  make it a separate patch then.

>> +    }
>> +
>> +    ret = 0;
>> +out:
>> +    qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice);
>> +
>> +    return ret;
>> +}
>> +
>>   int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
>>                             uint64_t bytes, enum qcow2_discard_type type,
>>                             bool full_discard)
>> @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState
>> *bs, uint64_t offset,
>>       BDRVQcow2State *s = bs->opaque;
>>       uint64_t end_offset = offset + bytes;
>>       uint64_t nb_clusters;
>> +    unsigned head, tail;
>>       int64_t cleared;
>>       int ret;
>>         /* Caller must pass aligned values, except at image end */
>> -    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
>> -    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
>> +    assert(QEMU_IS_ALIGNED(offset, s->subcluster_size));
>> +    assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) ||
>>              end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
>>   -    nb_clusters = size_to_clusters(s, bytes);
>> +    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
>> +    offset += head;
>> +
>> +    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
>> +           end_offset - MAX(offset, start_of_cluster(s, end_offset));
>> +    end_offset -= tail;
>>         s->cache_discards = true;
>>   +    if (head) {
>> +        ret = discard_l2_subclusters(bs, offset - head,
>> +                                     size_to_subclusters(s, head), type,
>> +                                     full_discard, NULL);
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>> +    }
>> +
>>       /* Each L2 slice is handled by its own loop iteration */
>> +    nb_clusters = size_to_clusters(s, end_offset - offset);
>> +
>>       while (nb_clusters > 0) {
> 
> I think the comment should stay attached to the `while`.
> 

Agreed.

> Hanna
> 
>>           cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
>>                                         full_discard);
>
Andrey Drobyshev Nov. 10, 2023, 1:26 p.m. UTC | #6
On 10/31/23 18:33, Hanna Czenczek wrote:
> (Sorry, opened another reply window, forgot I already had one open...)
> 
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> This commit makes the discard operation work on the subcluster level
>> rather than cluster level.  It introduces discard_l2_subclusters()
>> function and makes use of it in qcow2 discard implementation, much like
>> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
>> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
>> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
>> operation and free host disk space.
>>
>> This feature will let us gain additional disk space on guest
>> TRIM/discard requests, especially when using large enough clusters
>> (1M, 2M) with subclusters enabled.
>>
>> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
>> ---
>>   block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++--
>>   block/qcow2.c         |   8 ++--
>>   2 files changed, 101 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index 7c6fa5524c..cf40f2dc12 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs,
>> uint64_t offset, uint64_t nb_clusters,
>>       return nb_clusters;
>>   }
>>   +static int coroutine_fn GRAPH_RDLOCK
>> +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
>> +                       uint64_t nb_subclusters,
>> +                       enum qcow2_discard_type type,
>> +                       bool full_discard,
>> +                       SubClusterRangeInfo *pscri)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    uint64_t new_l2_bitmap, l2_bitmap_mask;
>> +    int ret, sc = offset_to_sc_index(s, offset);
>> +    SubClusterRangeInfo scri = { 0 };
>> +
>> +    if (!pscri) {
>> +        ret = get_sc_range_info(bs, offset, nb_subclusters, &scri);
>> +        if (ret < 0) {
>> +            goto out;
>> +        }
>> +    } else {
>> +        scri = *pscri;
> 
> Allowing to takes this from the caller sounds dangerous, considering we
> need to track who takes care of freeing scri.l2_slice.
> 
>> +    }
>> +
>> +    l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc +
>> nb_subclusters);
>> +    new_l2_bitmap = scri.l2_bitmap;
>> +    new_l2_bitmap &= ~l2_bitmap_mask;
>> +
>> +    /*
>> +     * If there're no allocated subclusters left, we might as well
>> discard
>> +     * the entire cluster.  That way we'd also update the refcount
>> table.
>> +     */
>> +    if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) {
> 
> What if there are subclusters in the cluster that are marked as zero,
> outside of the discarded range?  It sounds wrong to apply a discard with
> either full_discard set or cleared to them.
> 

Hmmm, then I think before falling to this path we should either:

1. Make sure full_discard=false.  That is directly implied from what you
said in your other mail: discarding with full_discard=false is
equivalent to zeroizing;
2. Check that no subclusters within this cluster are explicitly marked
as zeroes.
3. Check that either 1) or 2) is true (if ((1) || (2))).

For now I like option 3) better.

>> +        return discard_in_l2_slice(bs,
>> +                                   QEMU_ALIGN_DOWN(offset,
>> s->cluster_size),
>> +                                   1, type, full_discard);
>> +    }
>> +
>> +    /*
>> +     * Full discard means we fall through to the backing file, thus
>> we only
>> +     * need to mark the subclusters as deallocated.
> 
> I think it also means we need to clear the zero bits.
>

Yes, that seems right.

> Hanna
> 
> [...]
Andrey Drobyshev April 16, 2024, 7:56 p.m. UTC | #7
On 10/27/23 14:10, Jean-Louis Dupond wrote:
> [...]
> 
> I've checked all the code paths, and as far as I see it nowhere breaks
> the discard_no_unref option.
> It's important that we don't introduce new code paths that can make
> holes in the qcow2 image when this option is enabled :)
> 
> If you can confirm my conclusion, that would be great.
> 
> 
> Thanks
> Jean-Louis
> 

Hi Jean-Louis,

I've finally got to working on v2 for this series.  However I'm failing
to get a grasp on what this option is supposed to be doing and what are
we trying to avoid here.

Consider this simple example:

# cd build
# ./qemu-img create -f qcow2   unref.qcow2 192K
# ./qemu-img create -f qcow2 nounref.qcow2 192K
# ./qemu-io -c "write 0 192K"   unref.qcow2
# ./qemu-io -c "write 0 192K" nounref.qcow2
#
# strace -fv -e fallocate ./qemu-io -c "discard 64K 64K" unref.qcow2
[pid 887710] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
393216, 65536) = 0
discard 65536/65536 bytes at offset 65536
64 KiB, 1 ops; 00.00 sec (252.123 MiB/sec and 4033.9660 ops/sec)
#
# strace -fv -e fallocate ./qemu-io -c "reopen -o discard-no-unref=on"
-c "discard 64K 64K" nounref.qcow2
# [pid 887789] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
393216, 65536) = 0
discard 65536/65536 bytes at offset 65536
64 KiB, 1 ops; 00.00 sec (345.457 MiB/sec and 5527.3049 ops/sec)
#
# ./qemu-img check unref.qcow2

No errors were found on the image.
2/3 = 66.67% allocated, 50.00% fragmented, 0.00% compressed clusters
Image end offset: 524288
# ./qemu-img check nounref.qcow2
No errors were found on the image.
3/3 = 100.00% allocated, 0.00% fragmented, 0.00% compressed clusters
Image end offset: 524288
#
# ls -la *.qcow2

-rw-r--r-- 1 root root 524288 Apr 16 22:42 nounref.qcow2
-rw-r--r-- 1 root root 524288 Apr 16 22:41 unref.qcow2
# du --block-size=1 *.qcow2
397312  nounref.qcow2
397312  unref.qcow2

I understand that by keeping the L2 entry we achieve that cluster
remains formally allocated, but no matter whether "discard-no-unref"
option is enabled fallocate(FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) is
being called leaving a hole in the file (e.g. file becomes sparse).
However you say in the comment above that we can't allow making new
holes in the file when this option is enabled.  How does that correlate
and what do we achieve?  And which logic do you think we need to follow
when discarding separate subclusters?

Andrey
Jean-Louis Dupond April 19, 2024, 9:06 a.m. UTC | #8
On 16/04/2024 21:56, Andrey Drobyshev wrote:
> On 10/27/23 14:10, Jean-Louis Dupond wrote:
>> [...]
>>
>> I've checked all the code paths, and as far as I see it nowhere breaks
>> the discard_no_unref option.
>> It's important that we don't introduce new code paths that can make
>> holes in the qcow2 image when this option is enabled :)
>>
>> If you can confirm my conclusion, that would be great.
>>
>>
>> Thanks
>> Jean-Louis
>>
> Hi Jean-Louis,
>
> I've finally got to working on v2 for this series.  However I'm failing
> to get a grasp on what this option is supposed to be doing and what are
> we trying to avoid here.
The discard-no-unref option causes qemu to only zero the blocks/clusters 
that get discarded, but does NOT remove the reference of the cluster.
So the cluster stays allocated/referenced, but is just marked zero.

There are multiple scenario's where you would need this.
First of all when you have a pre-allocated image, you most likely 
created it because you don't want fragmentation.
But if you don't have discard-no-unref enabled, you will end up with a 
fragmented image anyway, because discard will create holes in your 
image, and will be randomly allocated. Ending up with a fragmented image.

Another scenario (and why we implemented it), is that with a sparse 
image, you allocate new blocks at the end of the 'allocation pointer' 
(which points to the first available blocks in your image).
But if you do discards, afaik the pointer is not moved to the freed 
cluster, but still allocates at the end until you reopen the image.
And even then, take you created a hole of 5 free clusters, and you need 
to allocate 4 new clusters, it will use those 5 and leave 1 empty cluster.
But the next allocation needs 2 clusters, it will jump to the next free 
space with at least 2 clusters. Leaving that 1 cluster unallocated.
And this caused us to have 'sparse' images of 110GB for 100GB images for 
example. Just because the qcow2 images was full of small empty clusters 
completely fragmented.
>
> Consider this simple example:
>
> # cd build
> # ./qemu-img create -f qcow2   unref.qcow2 192K
> # ./qemu-img create -f qcow2 nounref.qcow2 192K
> # ./qemu-io -c "write 0 192K"   unref.qcow2
> # ./qemu-io -c "write 0 192K" nounref.qcow2
> #
> # strace -fv -e fallocate ./qemu-io -c "discard 64K 64K" unref.qcow2
> [pid 887710] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
> 393216, 65536) = 0
> discard 65536/65536 bytes at offset 65536
> 64 KiB, 1 ops; 00.00 sec (252.123 MiB/sec and 4033.9660 ops/sec)
> #
> # strace -fv -e fallocate ./qemu-io -c "reopen -o discard-no-unref=on"
> -c "discard 64K 64K" nounref.qcow2
> # [pid 887789] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
> 393216, 65536) = 0
> discard 65536/65536 bytes at offset 65536
> 64 KiB, 1 ops; 00.00 sec (345.457 MiB/sec and 5527.3049 ops/sec)
> #
> # ./qemu-img check unref.qcow2
>
> No errors were found on the image.
> 2/3 = 66.67% allocated, 50.00% fragmented, 0.00% compressed clusters
> Image end offset: 524288
> # ./qemu-img check nounref.qcow2
> No errors were found on the image.
> 3/3 = 100.00% allocated, 0.00% fragmented, 0.00% compressed clusters
> Image end offset: 524288
> #
> # ls -la *.qcow2
>
> -rw-r--r-- 1 root root 524288 Apr 16 22:42 nounref.qcow2
> -rw-r--r-- 1 root root 524288 Apr 16 22:41 unref.qcow2
> # du --block-size=1 *.qcow2
> 397312  nounref.qcow2
> 397312  unref.qcow2
>
> I understand that by keeping the L2 entry we achieve that cluster
> remains formally allocated, but no matter whether "discard-no-unref"
> option is enabled fallocate(FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) is
> being called leaving a hole in the file (e.g. file becomes sparse).
> However you say in the comment above that we can't allow making new
> holes in the file when this option is enabled.  How does that correlate
> and what do we achieve?  And which logic do you think we need to follow
> when discarding separate subclusters?
>
> Andrey
diff mbox series

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 7c6fa5524c..cf40f2dc12 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2042,6 +2042,74 @@  discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
     return nb_clusters;
 }
 
+static int coroutine_fn GRAPH_RDLOCK
+discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
+                       uint64_t nb_subclusters,
+                       enum qcow2_discard_type type,
+                       bool full_discard,
+                       SubClusterRangeInfo *pscri)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t new_l2_bitmap, l2_bitmap_mask;
+    int ret, sc = offset_to_sc_index(s, offset);
+    SubClusterRangeInfo scri = { 0 };
+
+    if (!pscri) {
+        ret = get_sc_range_info(bs, offset, nb_subclusters, &scri);
+        if (ret < 0) {
+            goto out;
+        }
+    } else {
+        scri = *pscri;
+    }
+
+    l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
+    new_l2_bitmap = scri.l2_bitmap;
+    new_l2_bitmap &= ~l2_bitmap_mask;
+
+    /*
+     * If there're no allocated subclusters left, we might as well discard
+     * the entire cluster.  That way we'd also update the refcount table.
+     */
+    if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) {
+        return discard_in_l2_slice(bs,
+                                   QEMU_ALIGN_DOWN(offset, s->cluster_size),
+                                   1, type, full_discard);
+    }
+
+    /*
+     * Full discard means we fall through to the backing file, thus we only
+     * need to mark the subclusters as deallocated.
+     *
+     * Non-full discard means subclusters should be explicitly marked as
+     * zeroes.  In this case QCOW2 specification requires the corresponding
+     * allocation status bits to be unset as well.  If the subclusters are
+     * deallocated in the first place and there's no backing, the operation
+     * can be skipped.
+     */
+    if (!full_discard &&
+        (bs->backing || scri.l2_bitmap & l2_bitmap_mask)) {
+        new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
+    }
+
+    if (scri.l2_bitmap != new_l2_bitmap) {
+        set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
+        qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
+    }
+
+    if (s->discard_passthrough[type]) {
+        qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) +
+                            offset_into_cluster(s, offset),
+                            nb_subclusters * s->subcluster_size);
+    }
+
+    ret = 0;
+out:
+    qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice);
+
+    return ret;
+}
+
 int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
                           uint64_t bytes, enum qcow2_discard_type type,
                           bool full_discard)
@@ -2049,19 +2117,36 @@  int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
     BDRVQcow2State *s = bs->opaque;
     uint64_t end_offset = offset + bytes;
     uint64_t nb_clusters;
+    unsigned head, tail;
     int64_t cleared;
     int ret;
 
     /* Caller must pass aligned values, except at image end */
-    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
-    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+    assert(QEMU_IS_ALIGNED(offset, s->subcluster_size));
+    assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) ||
            end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
 
-    nb_clusters = size_to_clusters(s, bytes);
+    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
+    offset += head;
+
+    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
+           end_offset - MAX(offset, start_of_cluster(s, end_offset));
+    end_offset -= tail;
 
     s->cache_discards = true;
 
+    if (head) {
+        ret = discard_l2_subclusters(bs, offset - head,
+                                     size_to_subclusters(s, head), type,
+                                     full_discard, NULL);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
     /* Each L2 slice is handled by its own loop iteration */
+    nb_clusters = size_to_clusters(s, end_offset - offset);
+
     while (nb_clusters > 0) {
         cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
                                       full_discard);
@@ -2074,6 +2159,15 @@  int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
         offset += (cleared * s->cluster_size);
     }
 
+    if (tail) {
+        ret = discard_l2_subclusters(bs, end_offset,
+                                     size_to_subclusters(s, tail), type,
+                                     full_discard, NULL);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
     ret = 0;
 fail:
     s->cache_discards = false;
diff --git a/block/qcow2.c b/block/qcow2.c
index aa01d9e7b5..66961fa59e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1966,7 +1966,7 @@  static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
         bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto);
     }
     bs->bl.pwrite_zeroes_alignment = s->subcluster_size;
-    bs->bl.pdiscard_alignment = s->cluster_size;
+    bs->bl.pdiscard_alignment = s->subcluster_size;
 }
 
 static int GRAPH_UNLOCKED
@@ -4102,11 +4102,11 @@  qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
         return -ENOTSUP;
     }
 
-    if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
-        assert(bytes < s->cluster_size);
+    if (!QEMU_IS_ALIGNED(offset | bytes, bs->bl.pdiscard_alignment)) {
+        assert(bytes < bs->bl.pdiscard_alignment);
         /* Ignore partial clusters, except for the special case of the
          * complete partial cluster at the end of an unaligned file */
-        if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
+        if (!QEMU_IS_ALIGNED(offset, bs->bl.pdiscard_alignment) ||
             offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
             return -ENOTSUP;
         }