Message ID | 20231020215622.789260-5-andrey.drobyshev@virtuozzo.com |
---|---|
State | New |
Headers | show |
Series | qcow2: make subclusters discardable | expand |
On 20/10/2023 23:56, Andrey Drobyshev wrote: > This commit makes the discard operation work on the subcluster level > rather than cluster level. It introduces discard_l2_subclusters() > function and makes use of it in qcow2 discard implementation, much like > it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also > changes the qcow2 driver pdiscard_alignment to subcluster_size. That > way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) > operation and free host disk space. > > This feature will let us gain additional disk space on guest > TRIM/discard requests, especially when using large enough clusters > (1M, 2M) with subclusters enabled. > > Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> > --- > block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++-- > block/qcow2.c | 8 ++-- > 2 files changed, 101 insertions(+), 7 deletions(-) > > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c > index 7c6fa5524c..cf40f2dc12 100644 > --- a/block/qcow2-cluster.c > +++ b/block/qcow2-cluster.c > @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, > return nb_clusters; > } > > +static int coroutine_fn GRAPH_RDLOCK > +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, > + uint64_t nb_subclusters, > + enum qcow2_discard_type type, > + bool full_discard, > + SubClusterRangeInfo *pscri) > +{ > + BDRVQcow2State *s = bs->opaque; > + uint64_t new_l2_bitmap, l2_bitmap_mask; > + int ret, sc = offset_to_sc_index(s, offset); > + SubClusterRangeInfo scri = { 0 }; > + > + if (!pscri) { > + ret = get_sc_range_info(bs, offset, nb_subclusters, &scri); > + if (ret < 0) { > + goto out; > + } > + } else { > + scri = *pscri; > + } > + > + l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); > + new_l2_bitmap = scri.l2_bitmap; > + new_l2_bitmap &= ~l2_bitmap_mask; > + > + /* > + * If there're no allocated subclusters left, we might as well discard > + * the entire cluster. That way we'd also update the refcount table. > + */ > + if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) { > + return discard_in_l2_slice(bs, > + QEMU_ALIGN_DOWN(offset, s->cluster_size), > + 1, type, full_discard); > + } > + > + /* > + * Full discard means we fall through to the backing file, thus we only > + * need to mark the subclusters as deallocated. > + * > + * Non-full discard means subclusters should be explicitly marked as > + * zeroes. In this case QCOW2 specification requires the corresponding > + * allocation status bits to be unset as well. If the subclusters are > + * deallocated in the first place and there's no backing, the operation > + * can be skipped. > + */ > + if (!full_discard && > + (bs->backing || scri.l2_bitmap & l2_bitmap_mask)) { > + new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); > + } > + > + if (scri.l2_bitmap != new_l2_bitmap) { > + set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); > + qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); > + } > + > + if (s->discard_passthrough[type]) { > + qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) + > + offset_into_cluster(s, offset), > + nb_subclusters * s->subcluster_size); > + } > + > + ret = 0; > +out: > + qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice); > + > + return ret; > +} > + > int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, > uint64_t bytes, enum qcow2_discard_type type, > bool full_discard) > @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, > BDRVQcow2State *s = bs->opaque; > uint64_t end_offset = offset + bytes; > uint64_t nb_clusters; > + unsigned head, tail; > int64_t cleared; > int ret; > > /* Caller must pass aligned values, except at image end */ > - assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); > - assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || > + assert(QEMU_IS_ALIGNED(offset, s->subcluster_size)); > + assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) || > end_offset == bs->total_sectors << BDRV_SECTOR_BITS); > > - nb_clusters = size_to_clusters(s, bytes); > + head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; > + offset += head; > + > + tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 : > + end_offset - MAX(offset, start_of_cluster(s, end_offset)); > + end_offset -= tail; > > s->cache_discards = true; > > + if (head) { > + ret = discard_l2_subclusters(bs, offset - head, > + size_to_subclusters(s, head), type, > + full_discard, NULL); > + if (ret < 0) { > + goto fail; > + } > + } > + > /* Each L2 slice is handled by its own loop iteration */ > + nb_clusters = size_to_clusters(s, end_offset - offset); > + > while (nb_clusters > 0) { > cleared = discard_in_l2_slice(bs, offset, nb_clusters, type, > full_discard); > @@ -2074,6 +2159,15 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, > offset += (cleared * s->cluster_size); > } > > + if (tail) { > + ret = discard_l2_subclusters(bs, end_offset, > + size_to_subclusters(s, tail), type, > + full_discard, NULL); > + if (ret < 0) { > + goto fail; > + } > + } > + > ret = 0; > fail: > s->cache_discards = false; > diff --git a/block/qcow2.c b/block/qcow2.c > index aa01d9e7b5..66961fa59e 100644 > --- a/block/qcow2.c > +++ b/block/qcow2.c > @@ -1966,7 +1966,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) > bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto); > } > bs->bl.pwrite_zeroes_alignment = s->subcluster_size; > - bs->bl.pdiscard_alignment = s->cluster_size; > + bs->bl.pdiscard_alignment = s->subcluster_size; > } > > static int GRAPH_UNLOCKED > @@ -4102,11 +4102,11 @@ qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) > return -ENOTSUP; > } > > - if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) { > - assert(bytes < s->cluster_size); > + if (!QEMU_IS_ALIGNED(offset | bytes, bs->bl.pdiscard_alignment)) { > + assert(bytes < bs->bl.pdiscard_alignment); > /* Ignore partial clusters, except for the special case of the > * complete partial cluster at the end of an unaligned file */ > - if (!QEMU_IS_ALIGNED(offset, s->cluster_size) || > + if (!QEMU_IS_ALIGNED(offset, bs->bl.pdiscard_alignment) || > offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) { > return -ENOTSUP; > } I've checked all the code paths, and as far as I see it nowhere breaks the discard_no_unref option. It's important that we don't introduce new code paths that can make holes in the qcow2 image when this option is enabled :) If you can confirm my conclusion, that would be great. Thanks Jean-Louis
On 20.10.23 23:56, Andrey Drobyshev wrote: > This commit makes the discard operation work on the subcluster level > rather than cluster level. It introduces discard_l2_subclusters() > function and makes use of it in qcow2 discard implementation, much like > it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also > changes the qcow2 driver pdiscard_alignment to subcluster_size. That > way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) > operation and free host disk space. > > This feature will let us gain additional disk space on guest > TRIM/discard requests, especially when using large enough clusters > (1M, 2M) with subclusters enabled. > > Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> > --- > block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++-- > block/qcow2.c | 8 ++-- > 2 files changed, 101 insertions(+), 7 deletions(-) > > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c > index 7c6fa5524c..cf40f2dc12 100644 > --- a/block/qcow2-cluster.c > +++ b/block/qcow2-cluster.c [...] > + if (scri.l2_bitmap != new_l2_bitmap) { > + set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); > + qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); > + } > + > + if (s->discard_passthrough[type]) { > + qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) + > + offset_into_cluster(s, offset), > + nb_subclusters * s->subcluster_size); Are we sure that the cluster is allocated, i.e. that scri.l2_entry & L2E_OFFSET_MASK != 0? As a side note, I guess discard_in_l2_slice() should also use qcow2_queue_discard() instead of bdrv_pdiscard() then. > + } > + > + ret = 0; > +out: > + qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice); > + > + return ret; > +} > + > int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, > uint64_t bytes, enum qcow2_discard_type type, > bool full_discard) > @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, > BDRVQcow2State *s = bs->opaque; > uint64_t end_offset = offset + bytes; > uint64_t nb_clusters; > + unsigned head, tail; > int64_t cleared; > int ret; > > /* Caller must pass aligned values, except at image end */ > - assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); > - assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || > + assert(QEMU_IS_ALIGNED(offset, s->subcluster_size)); > + assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) || > end_offset == bs->total_sectors << BDRV_SECTOR_BITS); > > - nb_clusters = size_to_clusters(s, bytes); > + head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; > + offset += head; > + > + tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 : > + end_offset - MAX(offset, start_of_cluster(s, end_offset)); > + end_offset -= tail; > > s->cache_discards = true; > > + if (head) { > + ret = discard_l2_subclusters(bs, offset - head, > + size_to_subclusters(s, head), type, > + full_discard, NULL); > + if (ret < 0) { > + goto fail; > + } > + } > + > /* Each L2 slice is handled by its own loop iteration */ > + nb_clusters = size_to_clusters(s, end_offset - offset); > + > while (nb_clusters > 0) { I think the comment should stay attached to the `while`. Hanna > cleared = discard_in_l2_slice(bs, offset, nb_clusters, type, > full_discard);
(Sorry, opened another reply window, forgot I already had one open...) On 20.10.23 23:56, Andrey Drobyshev wrote: > This commit makes the discard operation work on the subcluster level > rather than cluster level. It introduces discard_l2_subclusters() > function and makes use of it in qcow2 discard implementation, much like > it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also > changes the qcow2 driver pdiscard_alignment to subcluster_size. That > way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) > operation and free host disk space. > > This feature will let us gain additional disk space on guest > TRIM/discard requests, especially when using large enough clusters > (1M, 2M) with subclusters enabled. > > Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> > --- > block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++-- > block/qcow2.c | 8 ++-- > 2 files changed, 101 insertions(+), 7 deletions(-) > > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c > index 7c6fa5524c..cf40f2dc12 100644 > --- a/block/qcow2-cluster.c > +++ b/block/qcow2-cluster.c > @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, > return nb_clusters; > } > > +static int coroutine_fn GRAPH_RDLOCK > +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, > + uint64_t nb_subclusters, > + enum qcow2_discard_type type, > + bool full_discard, > + SubClusterRangeInfo *pscri) > +{ > + BDRVQcow2State *s = bs->opaque; > + uint64_t new_l2_bitmap, l2_bitmap_mask; > + int ret, sc = offset_to_sc_index(s, offset); > + SubClusterRangeInfo scri = { 0 }; > + > + if (!pscri) { > + ret = get_sc_range_info(bs, offset, nb_subclusters, &scri); > + if (ret < 0) { > + goto out; > + } > + } else { > + scri = *pscri; Allowing to takes this from the caller sounds dangerous, considering we need to track who takes care of freeing scri.l2_slice. > + } > + > + l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); > + new_l2_bitmap = scri.l2_bitmap; > + new_l2_bitmap &= ~l2_bitmap_mask; > + > + /* > + * If there're no allocated subclusters left, we might as well discard > + * the entire cluster. That way we'd also update the refcount table. > + */ > + if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) { What if there are subclusters in the cluster that are marked as zero, outside of the discarded range? It sounds wrong to apply a discard with either full_discard set or cleared to them. > + return discard_in_l2_slice(bs, > + QEMU_ALIGN_DOWN(offset, s->cluster_size), > + 1, type, full_discard); > + } > + > + /* > + * Full discard means we fall through to the backing file, thus we only > + * need to mark the subclusters as deallocated. I think it also means we need to clear the zero bits. Hanna > + * > + * Non-full discard means subclusters should be explicitly marked as > + * zeroes. In this case QCOW2 specification requires the corresponding > + * allocation status bits to be unset as well. If the subclusters are > + * deallocated in the first place and there's no backing, the operation > + * can be skipped. > + */ > + if (!full_discard && > + (bs->backing || scri.l2_bitmap & l2_bitmap_mask)) { > + new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); > + }
On 20.10.23 23:56, Andrey Drobyshev wrote: > This commit makes the discard operation work on the subcluster level > rather than cluster level. It introduces discard_l2_subclusters() > function and makes use of it in qcow2 discard implementation, much like > it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also > changes the qcow2 driver pdiscard_alignment to subcluster_size. That > way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) > operation and free host disk space. > > This feature will let us gain additional disk space on guest > TRIM/discard requests, especially when using large enough clusters > (1M, 2M) with subclusters enabled. > > Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> > --- > block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++-- > block/qcow2.c | 8 ++-- > 2 files changed, 101 insertions(+), 7 deletions(-) > > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c > index 7c6fa5524c..cf40f2dc12 100644 > --- a/block/qcow2-cluster.c > +++ b/block/qcow2-cluster.c > @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, > return nb_clusters; > } > > +static int coroutine_fn GRAPH_RDLOCK > +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, > + uint64_t nb_subclusters, > + enum qcow2_discard_type type, > + bool full_discard, > + SubClusterRangeInfo *pscri) > +{ > + BDRVQcow2State *s = bs->opaque; > + uint64_t new_l2_bitmap, l2_bitmap_mask; > + int ret, sc = offset_to_sc_index(s, offset); > + SubClusterRangeInfo scri = { 0 }; > + > + if (!pscri) { > + ret = get_sc_range_info(bs, offset, nb_subclusters, &scri); > + if (ret < 0) { > + goto out; > + } > + } else { > + scri = *pscri; > + } > + > + l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); > + new_l2_bitmap = scri.l2_bitmap; > + new_l2_bitmap &= ~l2_bitmap_mask; > + > + /* > + * If there're no allocated subclusters left, we might as well discard > + * the entire cluster. That way we'd also update the refcount table. > + */ > + if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) { > + return discard_in_l2_slice(bs, > + QEMU_ALIGN_DOWN(offset, s->cluster_size), > + 1, type, full_discard); > + } scri.l2_slice is leaked here. Hanna
On 10/31/23 18:32, Hanna Czenczek wrote: > On 20.10.23 23:56, Andrey Drobyshev wrote: >> This commit makes the discard operation work on the subcluster level >> rather than cluster level. It introduces discard_l2_subclusters() >> function and makes use of it in qcow2 discard implementation, much like >> it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also >> changes the qcow2 driver pdiscard_alignment to subcluster_size. That >> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) >> operation and free host disk space. >> >> This feature will let us gain additional disk space on guest >> TRIM/discard requests, especially when using large enough clusters >> (1M, 2M) with subclusters enabled. >> >> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> >> --- >> block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++-- >> block/qcow2.c | 8 ++-- >> 2 files changed, 101 insertions(+), 7 deletions(-) >> >> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c >> index 7c6fa5524c..cf40f2dc12 100644 >> --- a/block/qcow2-cluster.c >> +++ b/block/qcow2-cluster.c > > [...] > >> + if (scri.l2_bitmap != new_l2_bitmap) { >> + set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); >> + qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); >> + } >> + >> + if (s->discard_passthrough[type]) { >> + qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) + >> + offset_into_cluster(s, offset), >> + nb_subclusters * s->subcluster_size); > > Are we sure that the cluster is allocated, i.e. that scri.l2_entry & > L2E_OFFSET_MASK != 0? > I think it must be illegal to mark the entire cluster as unallocated and yet mark some of the subclusters as allocated. So in the case you describe we should detect it earlier in the '!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)' condition and fall back to discard_in_l2_slice(). > As a side note, I guess discard_in_l2_slice() should also use > qcow2_queue_discard() instead of bdrv_pdiscard() then. > Yes, looks like it. I'll make it a separate patch then. >> + } >> + >> + ret = 0; >> +out: >> + qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice); >> + >> + return ret; >> +} >> + >> int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, >> uint64_t bytes, enum qcow2_discard_type type, >> bool full_discard) >> @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState >> *bs, uint64_t offset, >> BDRVQcow2State *s = bs->opaque; >> uint64_t end_offset = offset + bytes; >> uint64_t nb_clusters; >> + unsigned head, tail; >> int64_t cleared; >> int ret; >> /* Caller must pass aligned values, except at image end */ >> - assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); >> - assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || >> + assert(QEMU_IS_ALIGNED(offset, s->subcluster_size)); >> + assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) || >> end_offset == bs->total_sectors << BDRV_SECTOR_BITS); >> - nb_clusters = size_to_clusters(s, bytes); >> + head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; >> + offset += head; >> + >> + tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 : >> + end_offset - MAX(offset, start_of_cluster(s, end_offset)); >> + end_offset -= tail; >> s->cache_discards = true; >> + if (head) { >> + ret = discard_l2_subclusters(bs, offset - head, >> + size_to_subclusters(s, head), type, >> + full_discard, NULL); >> + if (ret < 0) { >> + goto fail; >> + } >> + } >> + >> /* Each L2 slice is handled by its own loop iteration */ >> + nb_clusters = size_to_clusters(s, end_offset - offset); >> + >> while (nb_clusters > 0) { > > I think the comment should stay attached to the `while`. > Agreed. > Hanna > >> cleared = discard_in_l2_slice(bs, offset, nb_clusters, type, >> full_discard); >
On 10/31/23 18:33, Hanna Czenczek wrote: > (Sorry, opened another reply window, forgot I already had one open...) > > On 20.10.23 23:56, Andrey Drobyshev wrote: >> This commit makes the discard operation work on the subcluster level >> rather than cluster level. It introduces discard_l2_subclusters() >> function and makes use of it in qcow2 discard implementation, much like >> it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also >> changes the qcow2 driver pdiscard_alignment to subcluster_size. That >> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) >> operation and free host disk space. >> >> This feature will let us gain additional disk space on guest >> TRIM/discard requests, especially when using large enough clusters >> (1M, 2M) with subclusters enabled. >> >> Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> >> --- >> block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++-- >> block/qcow2.c | 8 ++-- >> 2 files changed, 101 insertions(+), 7 deletions(-) >> >> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c >> index 7c6fa5524c..cf40f2dc12 100644 >> --- a/block/qcow2-cluster.c >> +++ b/block/qcow2-cluster.c >> @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, >> uint64_t offset, uint64_t nb_clusters, >> return nb_clusters; >> } >> +static int coroutine_fn GRAPH_RDLOCK >> +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, >> + uint64_t nb_subclusters, >> + enum qcow2_discard_type type, >> + bool full_discard, >> + SubClusterRangeInfo *pscri) >> +{ >> + BDRVQcow2State *s = bs->opaque; >> + uint64_t new_l2_bitmap, l2_bitmap_mask; >> + int ret, sc = offset_to_sc_index(s, offset); >> + SubClusterRangeInfo scri = { 0 }; >> + >> + if (!pscri) { >> + ret = get_sc_range_info(bs, offset, nb_subclusters, &scri); >> + if (ret < 0) { >> + goto out; >> + } >> + } else { >> + scri = *pscri; > > Allowing to takes this from the caller sounds dangerous, considering we > need to track who takes care of freeing scri.l2_slice. > >> + } >> + >> + l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + >> nb_subclusters); >> + new_l2_bitmap = scri.l2_bitmap; >> + new_l2_bitmap &= ~l2_bitmap_mask; >> + >> + /* >> + * If there're no allocated subclusters left, we might as well >> discard >> + * the entire cluster. That way we'd also update the refcount >> table. >> + */ >> + if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) { > > What if there are subclusters in the cluster that are marked as zero, > outside of the discarded range? It sounds wrong to apply a discard with > either full_discard set or cleared to them. > Hmmm, then I think before falling to this path we should either: 1. Make sure full_discard=false. That is directly implied from what you said in your other mail: discarding with full_discard=false is equivalent to zeroizing; 2. Check that no subclusters within this cluster are explicitly marked as zeroes. 3. Check that either 1) or 2) is true (if ((1) || (2))). For now I like option 3) better. >> + return discard_in_l2_slice(bs, >> + QEMU_ALIGN_DOWN(offset, >> s->cluster_size), >> + 1, type, full_discard); >> + } >> + >> + /* >> + * Full discard means we fall through to the backing file, thus >> we only >> + * need to mark the subclusters as deallocated. > > I think it also means we need to clear the zero bits. > Yes, that seems right. > Hanna > > [...]
On 10/27/23 14:10, Jean-Louis Dupond wrote: > [...] > > I've checked all the code paths, and as far as I see it nowhere breaks > the discard_no_unref option. > It's important that we don't introduce new code paths that can make > holes in the qcow2 image when this option is enabled :) > > If you can confirm my conclusion, that would be great. > > > Thanks > Jean-Louis > Hi Jean-Louis, I've finally got to working on v2 for this series. However I'm failing to get a grasp on what this option is supposed to be doing and what are we trying to avoid here. Consider this simple example: # cd build # ./qemu-img create -f qcow2 unref.qcow2 192K # ./qemu-img create -f qcow2 nounref.qcow2 192K # ./qemu-io -c "write 0 192K" unref.qcow2 # ./qemu-io -c "write 0 192K" nounref.qcow2 # # strace -fv -e fallocate ./qemu-io -c "discard 64K 64K" unref.qcow2 [pid 887710] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 393216, 65536) = 0 discard 65536/65536 bytes at offset 65536 64 KiB, 1 ops; 00.00 sec (252.123 MiB/sec and 4033.9660 ops/sec) # # strace -fv -e fallocate ./qemu-io -c "reopen -o discard-no-unref=on" -c "discard 64K 64K" nounref.qcow2 # [pid 887789] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 393216, 65536) = 0 discard 65536/65536 bytes at offset 65536 64 KiB, 1 ops; 00.00 sec (345.457 MiB/sec and 5527.3049 ops/sec) # # ./qemu-img check unref.qcow2 No errors were found on the image. 2/3 = 66.67% allocated, 50.00% fragmented, 0.00% compressed clusters Image end offset: 524288 # ./qemu-img check nounref.qcow2 No errors were found on the image. 3/3 = 100.00% allocated, 0.00% fragmented, 0.00% compressed clusters Image end offset: 524288 # # ls -la *.qcow2 -rw-r--r-- 1 root root 524288 Apr 16 22:42 nounref.qcow2 -rw-r--r-- 1 root root 524288 Apr 16 22:41 unref.qcow2 # du --block-size=1 *.qcow2 397312 nounref.qcow2 397312 unref.qcow2 I understand that by keeping the L2 entry we achieve that cluster remains formally allocated, but no matter whether "discard-no-unref" option is enabled fallocate(FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) is being called leaving a hole in the file (e.g. file becomes sparse). However you say in the comment above that we can't allow making new holes in the file when this option is enabled. How does that correlate and what do we achieve? And which logic do you think we need to follow when discarding separate subclusters? Andrey
On 16/04/2024 21:56, Andrey Drobyshev wrote: > On 10/27/23 14:10, Jean-Louis Dupond wrote: >> [...] >> >> I've checked all the code paths, and as far as I see it nowhere breaks >> the discard_no_unref option. >> It's important that we don't introduce new code paths that can make >> holes in the qcow2 image when this option is enabled :) >> >> If you can confirm my conclusion, that would be great. >> >> >> Thanks >> Jean-Louis >> > Hi Jean-Louis, > > I've finally got to working on v2 for this series. However I'm failing > to get a grasp on what this option is supposed to be doing and what are > we trying to avoid here. The discard-no-unref option causes qemu to only zero the blocks/clusters that get discarded, but does NOT remove the reference of the cluster. So the cluster stays allocated/referenced, but is just marked zero. There are multiple scenario's where you would need this. First of all when you have a pre-allocated image, you most likely created it because you don't want fragmentation. But if you don't have discard-no-unref enabled, you will end up with a fragmented image anyway, because discard will create holes in your image, and will be randomly allocated. Ending up with a fragmented image. Another scenario (and why we implemented it), is that with a sparse image, you allocate new blocks at the end of the 'allocation pointer' (which points to the first available blocks in your image). But if you do discards, afaik the pointer is not moved to the freed cluster, but still allocates at the end until you reopen the image. And even then, take you created a hole of 5 free clusters, and you need to allocate 4 new clusters, it will use those 5 and leave 1 empty cluster. But the next allocation needs 2 clusters, it will jump to the next free space with at least 2 clusters. Leaving that 1 cluster unallocated. And this caused us to have 'sparse' images of 110GB for 100GB images for example. Just because the qcow2 images was full of small empty clusters completely fragmented. > > Consider this simple example: > > # cd build > # ./qemu-img create -f qcow2 unref.qcow2 192K > # ./qemu-img create -f qcow2 nounref.qcow2 192K > # ./qemu-io -c "write 0 192K" unref.qcow2 > # ./qemu-io -c "write 0 192K" nounref.qcow2 > # > # strace -fv -e fallocate ./qemu-io -c "discard 64K 64K" unref.qcow2 > [pid 887710] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, > 393216, 65536) = 0 > discard 65536/65536 bytes at offset 65536 > 64 KiB, 1 ops; 00.00 sec (252.123 MiB/sec and 4033.9660 ops/sec) > # > # strace -fv -e fallocate ./qemu-io -c "reopen -o discard-no-unref=on" > -c "discard 64K 64K" nounref.qcow2 > # [pid 887789] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, > 393216, 65536) = 0 > discard 65536/65536 bytes at offset 65536 > 64 KiB, 1 ops; 00.00 sec (345.457 MiB/sec and 5527.3049 ops/sec) > # > # ./qemu-img check unref.qcow2 > > No errors were found on the image. > 2/3 = 66.67% allocated, 50.00% fragmented, 0.00% compressed clusters > Image end offset: 524288 > # ./qemu-img check nounref.qcow2 > No errors were found on the image. > 3/3 = 100.00% allocated, 0.00% fragmented, 0.00% compressed clusters > Image end offset: 524288 > # > # ls -la *.qcow2 > > -rw-r--r-- 1 root root 524288 Apr 16 22:42 nounref.qcow2 > -rw-r--r-- 1 root root 524288 Apr 16 22:41 unref.qcow2 > # du --block-size=1 *.qcow2 > 397312 nounref.qcow2 > 397312 unref.qcow2 > > I understand that by keeping the L2 entry we achieve that cluster > remains formally allocated, but no matter whether "discard-no-unref" > option is enabled fallocate(FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) is > being called leaving a hole in the file (e.g. file becomes sparse). > However you say in the comment above that we can't allow making new > holes in the file when this option is enabled. How does that correlate > and what do we achieve? And which logic do you think we need to follow > when discarding separate subclusters? > > Andrey
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 7c6fa5524c..cf40f2dc12 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, return nb_clusters; } +static int coroutine_fn GRAPH_RDLOCK +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, + uint64_t nb_subclusters, + enum qcow2_discard_type type, + bool full_discard, + SubClusterRangeInfo *pscri) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t new_l2_bitmap, l2_bitmap_mask; + int ret, sc = offset_to_sc_index(s, offset); + SubClusterRangeInfo scri = { 0 }; + + if (!pscri) { + ret = get_sc_range_info(bs, offset, nb_subclusters, &scri); + if (ret < 0) { + goto out; + } + } else { + scri = *pscri; + } + + l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); + new_l2_bitmap = scri.l2_bitmap; + new_l2_bitmap &= ~l2_bitmap_mask; + + /* + * If there're no allocated subclusters left, we might as well discard + * the entire cluster. That way we'd also update the refcount table. + */ + if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) { + return discard_in_l2_slice(bs, + QEMU_ALIGN_DOWN(offset, s->cluster_size), + 1, type, full_discard); + } + + /* + * Full discard means we fall through to the backing file, thus we only + * need to mark the subclusters as deallocated. + * + * Non-full discard means subclusters should be explicitly marked as + * zeroes. In this case QCOW2 specification requires the corresponding + * allocation status bits to be unset as well. If the subclusters are + * deallocated in the first place and there's no backing, the operation + * can be skipped. + */ + if (!full_discard && + (bs->backing || scri.l2_bitmap & l2_bitmap_mask)) { + new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); + } + + if (scri.l2_bitmap != new_l2_bitmap) { + set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); + } + + if (s->discard_passthrough[type]) { + qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) + + offset_into_cluster(s, offset), + nb_subclusters * s->subcluster_size); + } + + ret = 0; +out: + qcow2_cache_put(s->l2_table_cache, (void **) &scri.l2_slice); + + return ret; +} + int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, uint64_t bytes, enum qcow2_discard_type type, bool full_discard) @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, BDRVQcow2State *s = bs->opaque; uint64_t end_offset = offset + bytes; uint64_t nb_clusters; + unsigned head, tail; int64_t cleared; int ret; /* Caller must pass aligned values, except at image end */ - assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); - assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || + assert(QEMU_IS_ALIGNED(offset, s->subcluster_size)); + assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) || end_offset == bs->total_sectors << BDRV_SECTOR_BITS); - nb_clusters = size_to_clusters(s, bytes); + head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; + offset += head; + + tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 : + end_offset - MAX(offset, start_of_cluster(s, end_offset)); + end_offset -= tail; s->cache_discards = true; + if (head) { + ret = discard_l2_subclusters(bs, offset - head, + size_to_subclusters(s, head), type, + full_discard, NULL); + if (ret < 0) { + goto fail; + } + } + /* Each L2 slice is handled by its own loop iteration */ + nb_clusters = size_to_clusters(s, end_offset - offset); + while (nb_clusters > 0) { cleared = discard_in_l2_slice(bs, offset, nb_clusters, type, full_discard); @@ -2074,6 +2159,15 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, offset += (cleared * s->cluster_size); } + if (tail) { + ret = discard_l2_subclusters(bs, end_offset, + size_to_subclusters(s, tail), type, + full_discard, NULL); + if (ret < 0) { + goto fail; + } + } + ret = 0; fail: s->cache_discards = false; diff --git a/block/qcow2.c b/block/qcow2.c index aa01d9e7b5..66961fa59e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1966,7 +1966,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto); } bs->bl.pwrite_zeroes_alignment = s->subcluster_size; - bs->bl.pdiscard_alignment = s->cluster_size; + bs->bl.pdiscard_alignment = s->subcluster_size; } static int GRAPH_UNLOCKED @@ -4102,11 +4102,11 @@ qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) return -ENOTSUP; } - if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) { - assert(bytes < s->cluster_size); + if (!QEMU_IS_ALIGNED(offset | bytes, bs->bl.pdiscard_alignment)) { + assert(bytes < bs->bl.pdiscard_alignment); /* Ignore partial clusters, except for the special case of the * complete partial cluster at the end of an unaligned file */ - if (!QEMU_IS_ALIGNED(offset, s->cluster_size) || + if (!QEMU_IS_ALIGNED(offset, bs->bl.pdiscard_alignment) || offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) { return -ENOTSUP; }
This commit makes the discard operation work on the subcluster level rather than cluster level. It introduces discard_l2_subclusters() function and makes use of it in qcow2 discard implementation, much like it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also changes the qcow2 driver pdiscard_alignment to subcluster_size. That way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) operation and free host disk space. This feature will let us gain additional disk space on guest TRIM/discard requests, especially when using large enough clusters (1M, 2M) with subclusters enabled. Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> --- block/qcow2-cluster.c | 100 ++++++++++++++++++++++++++++++++++++++++-- block/qcow2.c | 8 ++-- 2 files changed, 101 insertions(+), 7 deletions(-)