Message ID | 20181128204840.4544-1-artem.blagodarenko@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v2] ext2fs: don't read group descriptors during e2label | expand |
> On Nov 28, 2018, at 1:48 PM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote: > > tune2fs is used to make e2label duties. ext2fs_open2() reads group > descriptors which are not used during disk label obtaining, but > takes a lot of time on large partitions. > > This patch change ext2fs_open2(), so only initialized > superblock is returned. without block descriptors. This save > time dramatically. > > Cray-bug-id: LUS-5777 > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com> One minor nit at the end, but looks fine otherwise: Reviewed-by: Andreas Dilger <adilger@dilger.ca> > > diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h > index 5b87d395..2abb6f76 100644 > --- a/lib/ext2fs/ext2fs.h > +++ b/lib/ext2fs/ext2fs.h > @@ -204,6 +204,7 @@ typedef struct ext2_file *ext2_file_t; > #define EXT2_FLAG_IGNORE_CSUM_ERRORS 0x200000 > #define EXT2_FLAG_SHARE_DUP 0x400000 > #define EXT2_FLAG_IGNORE_SB_ERRORS 0x800000 > +#define EXT2_FLAG_JOURNAL_ONLY 0x1000000 > > /* > * Special flag in the ext2 inode i_flag field that means that this is > diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c > index 85d73e2a..af671070 100644 > --- a/lib/ext2fs/openfs.c > +++ b/lib/ext2fs/openfs.c > @@ -135,6 +135,7 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, > int j; > #endif > char *time_env; > + unsigned long enough_desc_blocks; > > EXT2_CHECK_MAGIC(manager, EXT2_ET_MAGIC_IO_MANAGER); > > @@ -440,12 +441,23 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, > dest += fs->blocksize*first_meta_bg; > } > > - for (i = first_meta_bg ; i < fs->desc_blocks; i++) { > + /* > + * Need superblock and journal inode only. Do not read > + * met_bg group blocks if journal inode group already loaded > + */ > + if (flags & EXT2_FLAG_JOURNAL_ONLY) > + enough_desc_blocks = > + fs->super->s_journal_inum / > + EXT2_INODES_PER_GROUP(fs->super) + 1; > + else > + enough_desc_blocks = fs->desc_blocks; > + > + for (i = first_meta_bg; i < enough_desc_blocks; i++) { > blk = ext2fs_descriptor_block_loc2(fs, group_block, i); > io_channel_cache_readahead(fs->io, blk, 1); > } > > - for (i=first_meta_bg ; i < fs->desc_blocks; i++) { > + for (i = first_meta_bg; i < enough_desc_blocks; i++) { > blk = ext2fs_descriptor_block_loc2(fs, group_block, i); > retval = io_channel_read_blk64(fs->io, blk, 1, dest); > if (retval) > @@ -468,8 +480,12 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, > */ > if (superblock > 1 && ext2fs_has_group_desc_csum(fs)) { > dgrp_t group; > + dgrp_t enough_desc_count; > + > + enough_desc_count = enough_desc_blocks * > + EXT2_DESC_PER_BLOCK(fs->super); > > - for (group = 0; group < fs->group_desc_count; group++) { > + for (group = 0; group < enough_desc_count; group++) { > ext2fs_bg_flags_clear(fs, group, EXT2_BG_BLOCK_UNINIT); > ext2fs_bg_flags_clear(fs, group, EXT2_BG_INODE_UNINIT); > ext2fs_bg_itable_unused_set(fs, group, 0); > diff --git a/misc/tune2fs.c b/misc/tune2fs.c > index cd4057c3..b226fdbf 100644 > --- a/misc/tune2fs.c > +++ b/misc/tune2fs.c > @@ -2882,6 +2882,12 @@ retry_open: > /* keep the filesystem struct around to dump MMP data */ > open_flag |= EXT2_FLAG_NOFREE_ON_ERROR; > > + if (print_label || L_flag) { > + open_flag |= EXT2_FLAG_JOURNAL_ONLY; > + /* don't want metadata to be flushed at close */ > + //open_flag &= ~EXT2_FLAG_RW; > + } > + > retval = ext2fs_open2(device_name, io_options, open_flag, > 0, 0, io_ptr, &fs); > if (retval) { > @@ -2999,7 +3005,6 @@ _("Warning: The journal is dirty. You may wish to replay the journal like:\n\n" > if ((open_flag & EXT2_FLAG_RW) && !(mount_flags & (EXT2_MF_BUSY | EXT2_MF_MOUNTED)) && > ext2fs_has_feature_journal_needs_recovery(fs->super)) { > errcode_t err; > - > printf(_("Recovering journal.\n")); > err = ext2fs_run_ext3_journal(&fs); > if (err) { Not sure why this hunk is here? There should normally be a blank line after local variable declarations. Cheers, Andreas
On Wed, Nov 28, 2018 at 02:16:30PM -0700, Andreas Dilger wrote: > > > On Nov 28, 2018, at 1:48 PM, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote: > > > > tune2fs is used to make e2label duties. ext2fs_open2() reads group > > descriptors which are not used during disk label obtaining, but > > takes a lot of time on large partitions. > > > > This patch change ext2fs_open2(), so only initialized > > superblock is returned. without block descriptors. This save > > time dramatically. > > > > Cray-bug-id: LUS-5777 > > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com> > > One minor nit at the end, but looks fine otherwise: > > Reviewed-by: Andreas Dilger <adilger@dilger.ca> > > > > > diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h > > index 5b87d395..2abb6f76 100644 > > --- a/lib/ext2fs/ext2fs.h > > +++ b/lib/ext2fs/ext2fs.h > > @@ -204,6 +204,7 @@ typedef struct ext2_file *ext2_file_t; > > #define EXT2_FLAG_IGNORE_CSUM_ERRORS 0x200000 > > #define EXT2_FLAG_SHARE_DUP 0x400000 > > #define EXT2_FLAG_IGNORE_SB_ERRORS 0x800000 > > +#define EXT2_FLAG_JOURNAL_ONLY 0x1000000 > > > > /* > > * Special flag in the ext2 inode i_flag field that means that this is > > diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c > > index 85d73e2a..af671070 100644 > > --- a/lib/ext2fs/openfs.c > > +++ b/lib/ext2fs/openfs.c > > @@ -135,6 +135,7 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, > > int j; > > #endif > > char *time_env; > > + unsigned long enough_desc_blocks; > > > > EXT2_CHECK_MAGIC(manager, EXT2_ET_MAGIC_IO_MANAGER); > > > > @@ -440,12 +441,23 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, > > dest += fs->blocksize*first_meta_bg; > > } > > > > - for (i = first_meta_bg ; i < fs->desc_blocks; i++) { > > + /* > > + * Need superblock and journal inode only. Do not read > > + * met_bg group blocks if journal inode group already loaded > > + */ > > + if (flags & EXT2_FLAG_JOURNAL_ONLY) > > + enough_desc_blocks = > > + fs->super->s_journal_inum / > > + EXT2_INODES_PER_GROUP(fs->super) + 1; > > + else > > + enough_desc_blocks = fs->desc_blocks; > > + > > + for (i = first_meta_bg; i < enough_desc_blocks; i++) { > > blk = ext2fs_descriptor_block_loc2(fs, group_block, i); > > io_channel_cache_readahead(fs->io, blk, 1); > > } > > > > - for (i=first_meta_bg ; i < fs->desc_blocks; i++) { > > + for (i = first_meta_bg; i < enough_desc_blocks; i++) { > > blk = ext2fs_descriptor_block_loc2(fs, group_block, i); > > retval = io_channel_read_blk64(fs->io, blk, 1, dest); > > if (retval) > > @@ -468,8 +480,12 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, > > */ > > if (superblock > 1 && ext2fs_has_group_desc_csum(fs)) { > > dgrp_t group; > > + dgrp_t enough_desc_count; > > + > > + enough_desc_count = enough_desc_blocks * > > + EXT2_DESC_PER_BLOCK(fs->super); > > > > - for (group = 0; group < fs->group_desc_count; group++) { > > + for (group = 0; group < enough_desc_count; group++) { > > ext2fs_bg_flags_clear(fs, group, EXT2_BG_BLOCK_UNINIT); > > ext2fs_bg_flags_clear(fs, group, EXT2_BG_INODE_UNINIT); > > ext2fs_bg_itable_unused_set(fs, group, 0); > > diff --git a/misc/tune2fs.c b/misc/tune2fs.c > > index cd4057c3..b226fdbf 100644 > > --- a/misc/tune2fs.c > > +++ b/misc/tune2fs.c > > @@ -2882,6 +2882,12 @@ retry_open: > > /* keep the filesystem struct around to dump MMP data */ > > open_flag |= EXT2_FLAG_NOFREE_ON_ERROR; > > > > + if (print_label || L_flag) { > > + open_flag |= EXT2_FLAG_JOURNAL_ONLY; > > + /* don't want metadata to be flushed at close */ > > + //open_flag &= ~EXT2_FLAG_RW; Might want to remove this ^^^^^ commented out line too... --D > > + } > > + > > retval = ext2fs_open2(device_name, io_options, open_flag, > > 0, 0, io_ptr, &fs); > > if (retval) { > > @@ -2999,7 +3005,6 @@ _("Warning: The journal is dirty. You may wish to replay the journal like:\n\n" > > if ((open_flag & EXT2_FLAG_RW) && !(mount_flags & (EXT2_MF_BUSY | EXT2_MF_MOUNTED)) && > > ext2fs_has_feature_journal_needs_recovery(fs->super)) { > > errcode_t err; > > - > > printf(_("Recovering journal.\n")); > > err = ext2fs_run_ext3_journal(&fs); > > if (err) { > > Not sure why this hunk is here? There should normally be a blank line after > local variable declarations. > > Cheers, Andreas > > > > >
On Wed, Nov 28, 2018 at 11:48:40PM +0300, Artem Blagodarenko wrote: > tune2fs is used to make e2label duties. ext2fs_open2() reads group > descriptors which are not used during disk label obtaining, but > takes a lot of time on large partitions. Are you trying to speed up using e2label to *read* the label, or e2label to *write* the label? How often do you need to write the label, and do you need to write the label with a dirty journal? I really don't like this patch because it makes the EXT2_FLAG_JOURNAL_ONLY flag an **extremely** hazardous flag to use. If the application uses it wrong, it could lead to extremely serious, disastrous data loss. In fact, I'm going to guess that you only care about reading the label, and you didn't even test using "tune2fs -L new_label /dev/large_file_system_with_precious_data". Because if you did, the tune2fs -L would call ext2fs_mark_super_dirty(), and then ext2fs_close() would call ext2fs_flushfs(), which would write out all of the block groups, writing uninitialized trash into all of the block group descriptor blocks beyond the first one, and your large production file system would be very sad. So, ah, NACK. :-) If what you care about is "print label" case, what I would suggest is a new interface which simply reads the superblock into a struct ext2_super_block and calls ext2fs_swap_super() on it. This will speed up the e2label read case. If you want to speed up the e2label write case when the file system is cleanly unmounted, we could have a new interface that writes out the superblock, and tune2fs would use it if the file system does not need to have a journal replay. But if the journal does need to be replayed, we would need to do a full ext2fs_open2() call. When designing library interfaces, it's always important to imagine how an well-meaning programmer calling them might misuse them. Leaving land-mines for programmers to trip over --- especially when you trip over them yourself in the same patch which introduces the interfaces --- is just not a nice thing to do. For your amusement, I'll leave you with: Rusty Rusell's writeup, "How Do I Make This Hard to Misuse": https://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html And the flip side, his "What If I Don't Actually Like My Users?" https://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html - Ted
Hello Andreas, Darric, Ted, Thanks for review. > Might want to remove this ^^^^^ commented out line too… The answer is bellow in this letter. > On 29 Nov 2018, at 07:10, Theodore Y. Ts'o <tytso@mit.edu> wrote: > > On Wed, Nov 28, 2018 at 11:48:40PM +0300, Artem Blagodarenko wrote: >> tune2fs is used to make e2label duties. ext2fs_open2() reads group >> descriptors which are not used during disk label obtaining, but >> takes a lot of time on large partitions. > > Are you trying to speed up using e2label to *read* the label, or > e2label to *write* the label? How often do you need to write the > label, and do you need to write the label with a dirty journal? > > I really don't like this patch because it makes the > EXT2_FLAG_JOURNAL_ONLY flag an **extremely** hazardous flag to use. > If the application uses it wrong, it could lead to extremely serious, > disastrous data loss. > > In fact, I'm going to guess that you only care about reading the > label, and you didn't even test using "tune2fs -L new_label > /dev/large_file_system_with_precious_data". Because if you did, the > tune2fs -L would call ext2fs_mark_super_dirty(), and then > ext2fs_close() would call ext2fs_flushfs(), which would write out all > of the block groups, writing uninitialized trash into all of the block > group descriptor blocks beyond the first one, and your large > production file system would be very sad. > > So, ah, NACK. :-) I am care about reading and writing. It is quite common operations during cluster setup, start and operation. Tune2fs reads superblock, journal block, fix or return label and terminates. The only thing I am worry is a journaling. Journal can be replied. That can lead to flushing uninitialised data. To solve this problem I added string Darric asked for. > + /* don't want metadata to be flushed at close */ > + //open_flag &= ~EXT2_FLAG_RW; This fixes flushing uninitialised data problem on label reading codepath, but brakes t_replay_and_set test. -Recovering journal. fsck the whole mess +test_filesys: recovering journal Also, I just noticed it brakes label applying. Now I understand that patch is completely bad. I would drop it and write it other way. > If what you care about is "print label" case, what I would suggest is > a new interface which simply reads the superblock into a struct > ext2_super_block and calls ext2fs_swap_super() on it. This will speed > up the e2label read case. > > If you want to speed up the e2label write case when the file system is > cleanly unmounted, we could have a new interface that writes out the > superblock, and tune2fs would use it if the file system does not need > to have a journal replay. But if the journal does need to be > replayed, we would need to do a full ext2fs_open2() call. Thanks. This is good idea for new patch. > > When designing library interfaces, it's always important to imagine > how an well-meaning programmer calling them might misuse them. > Leaving land-mines for programmers to trip over --- especially when > you trip over them yourself in the same patch which introduces the > interfaces --- is just not a nice thing to do. For your amusement, > I'll leave you with: > > Rusty Rusell's writeup, "How Do I Make This Hard to Misuse": > > https://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html > > And the flip side, his "What If I Don't Actually Like My Users?" > > https://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html Thanks for links. I have read it. Very useful. Now I know that e2label -> tune2fs link brakes "The obvious use is (probably) the correct one” rule. I didn’t expect that e2label is not really utility in my system (despite there are sources in tree), but link to tune2fs, then faced with long label acquiring first time. But probably I have wrong expectations. > > - Ted Best regards, Artem Blagodarenko.
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h index 5b87d395..2abb6f76 100644 --- a/lib/ext2fs/ext2fs.h +++ b/lib/ext2fs/ext2fs.h @@ -204,6 +204,7 @@ typedef struct ext2_file *ext2_file_t; #define EXT2_FLAG_IGNORE_CSUM_ERRORS 0x200000 #define EXT2_FLAG_SHARE_DUP 0x400000 #define EXT2_FLAG_IGNORE_SB_ERRORS 0x800000 +#define EXT2_FLAG_JOURNAL_ONLY 0x1000000 /* * Special flag in the ext2 inode i_flag field that means that this is diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c index 85d73e2a..af671070 100644 --- a/lib/ext2fs/openfs.c +++ b/lib/ext2fs/openfs.c @@ -135,6 +135,7 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, int j; #endif char *time_env; + unsigned long enough_desc_blocks; EXT2_CHECK_MAGIC(manager, EXT2_ET_MAGIC_IO_MANAGER); @@ -440,12 +441,23 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, dest += fs->blocksize*first_meta_bg; } - for (i = first_meta_bg ; i < fs->desc_blocks; i++) { + /* + * Need superblock and journal inode only. Do not read + * met_bg group blocks if journal inode group already loaded + */ + if (flags & EXT2_FLAG_JOURNAL_ONLY) + enough_desc_blocks = + fs->super->s_journal_inum / + EXT2_INODES_PER_GROUP(fs->super) + 1; + else + enough_desc_blocks = fs->desc_blocks; + + for (i = first_meta_bg; i < enough_desc_blocks; i++) { blk = ext2fs_descriptor_block_loc2(fs, group_block, i); io_channel_cache_readahead(fs->io, blk, 1); } - for (i=first_meta_bg ; i < fs->desc_blocks; i++) { + for (i = first_meta_bg; i < enough_desc_blocks; i++) { blk = ext2fs_descriptor_block_loc2(fs, group_block, i); retval = io_channel_read_blk64(fs->io, blk, 1, dest); if (retval) @@ -468,8 +480,12 @@ errcode_t ext2fs_open2(const char *name, const char *io_options, */ if (superblock > 1 && ext2fs_has_group_desc_csum(fs)) { dgrp_t group; + dgrp_t enough_desc_count; + + enough_desc_count = enough_desc_blocks * + EXT2_DESC_PER_BLOCK(fs->super); - for (group = 0; group < fs->group_desc_count; group++) { + for (group = 0; group < enough_desc_count; group++) { ext2fs_bg_flags_clear(fs, group, EXT2_BG_BLOCK_UNINIT); ext2fs_bg_flags_clear(fs, group, EXT2_BG_INODE_UNINIT); ext2fs_bg_itable_unused_set(fs, group, 0); diff --git a/misc/tune2fs.c b/misc/tune2fs.c index cd4057c3..b226fdbf 100644 --- a/misc/tune2fs.c +++ b/misc/tune2fs.c @@ -2882,6 +2882,12 @@ retry_open: /* keep the filesystem struct around to dump MMP data */ open_flag |= EXT2_FLAG_NOFREE_ON_ERROR; + if (print_label || L_flag) { + open_flag |= EXT2_FLAG_JOURNAL_ONLY; + /* don't want metadata to be flushed at close */ + //open_flag &= ~EXT2_FLAG_RW; + } + retval = ext2fs_open2(device_name, io_options, open_flag, 0, 0, io_ptr, &fs); if (retval) { @@ -2999,7 +3005,6 @@ _("Warning: The journal is dirty. You may wish to replay the journal like:\n\n" if ((open_flag & EXT2_FLAG_RW) && !(mount_flags & (EXT2_MF_BUSY | EXT2_MF_MOUNTED)) && ext2fs_has_feature_journal_needs_recovery(fs->super)) { errcode_t err; - printf(_("Recovering journal.\n")); err = ext2fs_run_ext3_journal(&fs); if (err) {
tune2fs is used to make e2label duties. ext2fs_open2() reads group descriptors which are not used during disk label obtaining, but takes a lot of time on large partitions. This patch change ext2fs_open2(), so only initialized superblock is returned. without block descriptors. This save time dramatically. Cray-bug-id: LUS-5777 Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com> --- lib/ext2fs/ext2fs.h | 1 + lib/ext2fs/openfs.c | 22 +++++++++++++++++++--- misc/tune2fs.c | 7 ++++++- 3 files changed, 26 insertions(+), 4 deletions(-)