Message ID | 1422470996-20820-2-git-send-email-den@openvz.org |
---|---|
State | New |
Headers | show |
On 28/01/15 21:49, Denis V. Lunev wrote: > The following sequence > int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644); > for (i = 0; i < 100000; i++) > write(fd, buf, 4096); > performs 10% better if buf is aligned to 4096 bytes rather then to > 512 bytes on HDD with 512/4096 logical/physical sector size. > > The difference is quite reliable. > > Signed-off-by: Denis V. Lunev <den@openvz.org> > CC: Kevin Wolf <kwolf@redhat.com> > CC: Stefan Hajnoczi <stefanha@redhat.com> > --- > block.c | 4 ++-- > block/raw-posix.c | 4 ++-- > 2 files changed, 4 insertions(+), 4 deletions(-) > > diff --git a/block.c b/block.c > index d45e4dd..bc5d1e7 100644 > --- a/block.c > +++ b/block.c > @@ -543,7 +543,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) > bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; > bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; > } else { > - bs->bl.opt_mem_alignment = 512; > + bs->bl.opt_mem_alignment = 4096; > } > > if (bs->backing_hd) { > @@ -966,7 +966,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, > > bs->open_flags = flags; > bs->guest_block_size = 512; > - bs->request_alignment = 512; > + bs->request_alignment = 4096; > bs->zero_beyond_eof = true; > open_flags = bdrv_open_flags(bs, flags); > bs->read_only = !(open_flags & BDRV_O_RDWR); > diff --git a/block/raw-posix.c b/block/raw-posix.c > index ec38fee..d1b3388 100644 > --- a/block/raw-posix.c > +++ b/block/raw-posix.c > @@ -266,7 +266,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) > if (!s->buf_align) { > size_t align; > buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); > - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { > + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { > if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { > s->buf_align = align; > break; > @@ -278,7 +278,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) > if 
(!bs->request_alignment) { > size_t align; > buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); > - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { > + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { > if (pread(fd, buf, align, 0) >= 0) { > bs->request_alignment = align; > break; Sorry, the patch is wrong. It breaks 'make check-block'. I will redo it and perform more testing. The request-alignment related changes are wrong :( I ran the tests without them but added them as an obvious last-minute addition.
On 28/01/2015 19:49, Denis V. Lunev wrote: > The following sequence > int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644); > for (i = 0; i < 100000; i++) > write(fd, buf, 4096); > performs 10% better if buf is aligned to 4096 bytes rather then to > 512 bytes on HDD with 512/4096 logical/physical sector size. > > The difference is quite reliable. The 10% difference, however, is probably not enough to cover the cost of providing a bounce buffer if a guest is (rightfully) using a 512-byte aligned buffer: bs->bl.opt_mem_alignment is in fact badly named and it should be bs->bl.min_mem_alignment instead. Instead, you probably should patch bdrv_opt_mem_align to return at least 4096, and leave the detection logic intact. This will let qemu_blockalign return a properly aligned buffer to qemu-img and other in-process allocations, without negatively affecting the guest. Thanks, Paolo > Signed-off-by: Denis V. Lunev <den@openvz.org> > CC: Kevin Wolf <kwolf@redhat.com> > CC: Stefan Hajnoczi <stefanha@redhat.com> > --- > block.c | 4 ++-- > block/raw-posix.c | 4 ++-- > 2 files changed, 4 insertions(+), 4 deletions(-) > > diff --git a/block.c b/block.c > index d45e4dd..bc5d1e7 100644 > --- a/block.c > +++ b/block.c > @@ -543,7 +543,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) > bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; > bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; > } else { > - bs->bl.opt_mem_alignment = 512; > + bs->bl.opt_mem_alignment = 4096; > } > > if (bs->backing_hd) { > @@ -966,7 +966,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, > > bs->open_flags = flags; > bs->guest_block_size = 512; > - bs->request_alignment = 512; > + bs->request_alignment = 4096; > bs->zero_beyond_eof = true; > open_flags = bdrv_open_flags(bs, flags); > bs->read_only = !(open_flags & BDRV_O_RDWR); > diff --git a/block/raw-posix.c b/block/raw-posix.c > index ec38fee..d1b3388 100644 > --- a/block/raw-posix.c 
> +++ b/block/raw-posix.c > @@ -266,7 +266,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) > if (!s->buf_align) { > size_t align; > buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); > - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { > + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { > if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { > s->buf_align = align; > break; > @@ -278,7 +278,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) > if (!bs->request_alignment) { > size_t align; > buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); > - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { > + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { > if (pread(fd, buf, align, 0) >= 0) { > bs->request_alignment = align; > break; >
On 28/01/15 23:07, Paolo Bonzini wrote: > > On 28/01/2015 19:49, Denis V. Lunev wrote: >> The following sequence >> int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644); >> for (i = 0; i < 100000; i++) >> write(fd, buf, 4096); >> performs 10% better if buf is aligned to 4096 bytes rather then to >> 512 bytes on HDD with 512/4096 logical/physical sector size. >> >> The difference is quite reliable. > The 10% difference, however, is probably not enough to cover the cost of > providing a bounce buffer if a guest is (rightfully) using a 512-byte > aligned buffer: bs->bl.opt_mem_alignment is in fact badly named and it > should be bs->bl.min_mem_alignment instead. > > Instead, you probably should patch bdrv_opt_mem_align to return at least > 4096, and leave the detection logic intact. This will let > qemu_blockalign return a properly aligned buffer to qemu-img and other > in-process allocations, without negatively affecting the guest. > > Thanks, > > Paolo ok, this looks good to me :) >> Signed-off-by: Denis V. 
Lunev <den@openvz.org> >> CC: Kevin Wolf <kwolf@redhat.com> >> CC: Stefan Hajnoczi <stefanha@redhat.com> >> --- >> block.c | 4 ++-- >> block/raw-posix.c | 4 ++-- >> 2 files changed, 4 insertions(+), 4 deletions(-) >> >> diff --git a/block.c b/block.c >> index d45e4dd..bc5d1e7 100644 >> --- a/block.c >> +++ b/block.c >> @@ -543,7 +543,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) >> bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; >> bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; >> } else { >> - bs->bl.opt_mem_alignment = 512; >> + bs->bl.opt_mem_alignment = 4096; >> } >> >> if (bs->backing_hd) { >> @@ -966,7 +966,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, >> >> bs->open_flags = flags; >> bs->guest_block_size = 512; >> - bs->request_alignment = 512; >> + bs->request_alignment = 4096; >> bs->zero_beyond_eof = true; >> open_flags = bdrv_open_flags(bs, flags); >> bs->read_only = !(open_flags & BDRV_O_RDWR); >> diff --git a/block/raw-posix.c b/block/raw-posix.c >> index ec38fee..d1b3388 100644 >> --- a/block/raw-posix.c >> +++ b/block/raw-posix.c >> @@ -266,7 +266,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) >> if (!s->buf_align) { >> size_t align; >> buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); >> - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { >> + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { >> if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { >> s->buf_align = align; >> break; >> @@ -278,7 +278,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) >> if (!bs->request_alignment) { >> size_t align; >> buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); >> - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { >> + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { >> if (pread(fd, buf, align, 0) >= 0) { >> bs->request_alignment = align; >> break; >>
diff --git a/block.c b/block.c index d45e4dd..bc5d1e7 100644 --- a/block.c +++ b/block.c @@ -543,7 +543,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; } else { - bs->bl.opt_mem_alignment = 512; + bs->bl.opt_mem_alignment = 4096; } if (bs->backing_hd) { @@ -966,7 +966,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, bs->open_flags = flags; bs->guest_block_size = 512; - bs->request_alignment = 512; + bs->request_alignment = 4096; bs->zero_beyond_eof = true; open_flags = bdrv_open_flags(bs, flags); bs->read_only = !(open_flags & BDRV_O_RDWR); diff --git a/block/raw-posix.c b/block/raw-posix.c index ec38fee..d1b3388 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -266,7 +266,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) if (!s->buf_align) { size_t align; buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { s->buf_align = align; break; @@ -278,7 +278,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) if (!bs->request_alignment) { size_t align; buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); - for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) { if (pread(fd, buf, align, 0) >= 0) { bs->request_alignment = align; break;
The following sequence int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644); for (i = 0; i < 100000; i++) write(fd, buf, 4096); performs 10% better if buf is aligned to 4096 bytes rather than to 512 bytes on an HDD with 512/4096 logical/physical sector size. The difference is quite reliable. Signed-off-by: Denis V. Lunev <den@openvz.org> CC: Kevin Wolf <kwolf@redhat.com> CC: Stefan Hajnoczi <stefanha@redhat.com> --- block.c | 4 ++-- block/raw-posix.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-)