Message ID | 201108112331.13241.rjw@sisk.pl |
---|---|
State | New, archived |
Headers | show |
Hi, On Thu 11-08-11 23:31:13, Rafael J. Wysocki wrote: > From: Rafael J. Wysocki <rjw@sisk.pl> > > In analogy with ext4 make ext3_freeze() always call > journal_unlock_updates() to prevent it from leaving a locked mutex > behind. Accordingly, modify ext3_unfreeze() so that it doesn't > call journal_unlock_updates() any more. > > Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl> > --- > > Sorry for the duplicate, the previous one was sent too early. > > --- > fs/ext3/super.c | 39 ++++++++++++++++++--------------------- > 1 file changed, 18 insertions(+), 21 deletions(-) > > Index: linux/fs/ext3/super.c > =================================================================== > --- linux.orig/fs/ext3/super.c > +++ linux/fs/ext3/super.c > @@ -2535,30 +2535,28 @@ static int ext3_sync_fs(struct super_blo > */ > static int ext3_freeze(struct super_block *sb) > { > - int error = 0; > + int error; > journal_t *journal; > > - if (!(sb->s_flags & MS_RDONLY)) { > - journal = EXT3_SB(sb)->s_journal; > + if (sb->s_flags & MS_RDONLY) > + return 0; > > - /* Now we set up the journal barrier. */ > - journal_lock_updates(journal); > + journal = EXT3_SB(sb)->s_journal; > > - /* > - * We don't want to clear needs_recovery flag when we failed > - * to flush the journal. > - */ > - error = journal_flush(journal); > - if (error < 0) > - goto out; > - > - /* Journal blocked and flushed, clear needs_recovery flag. */ > - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > - error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > - if (error) > - goto out; > - } > - return 0; > + /* Now we set up the journal barrier. */ > + journal_lock_updates(journal); > + > + /* > + * We don't want to clear needs_recovery flag when we failed > + * to flush the journal. > + */ > + error = journal_flush(journal); > + if (error < 0) > + goto out; > + > + /* Journal blocked and flushed, clear needs_recovery flag. */ > + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > > out: > journal_unlock_updates(journal); > @@ -2577,7 +2575,6 @@ static int ext3_unfreeze(struct super_bl > EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > unlock_super(sb); > - journal_unlock_updates(EXT3_SB(sb)->s_journal); > } > return 0; > } It's not so simple as this. Ext3 relies on the mutex (the one hidden in journal_lock_updates()) to make sure that new transaction cannot be started while the filesystem is frozen - that's essentially what makes the filesystem frozen. So if we want to get rid of the mutex we have to achieve blocking by something else - ext4 uses vfs_check_frozen() in ext4_journal_start(). BTW, filesystem freezing never really worked for mmaped writes under ext3 - ext3 would have to implement page_mkwrite() callback for that - so if you want to rely on it for suspending, this will be non-trivial. Honza
Hi, On Monday, August 15, 2011, Jan Kara wrote: > Hi, > On Thu 11-08-11 23:31:13, Rafael J. Wysocki wrote: > > From: Rafael J. Wysocki <rjw@sisk.pl> > > > > In analogy with ext4 make ext3_freeze() always call > > journal_unlock_updates() to prevent it from leaving a locked mutex > > behind. Accordingly, modify ext3_unfreeze() so that it doesn't > > call journal_unlock_updates() any more. > > > > Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl> > > --- > > > > Sorry for the duplicate, the previous one was sent too early. > > > > --- > > fs/ext3/super.c | 39 ++++++++++++++++++--------------------- > > 1 file changed, 18 insertions(+), 21 deletions(-) > > > > Index: linux/fs/ext3/super.c > > =================================================================== > > --- linux.orig/fs/ext3/super.c > > +++ linux/fs/ext3/super.c > > @@ -2535,30 +2535,28 @@ static int ext3_sync_fs(struct super_blo > > */ > > static int ext3_freeze(struct super_block *sb) > > { > > - int error = 0; > > + int error; > > journal_t *journal; > > > > - if (!(sb->s_flags & MS_RDONLY)) { > > - journal = EXT3_SB(sb)->s_journal; > > + if (sb->s_flags & MS_RDONLY) > > + return 0; > > > > - /* Now we set up the journal barrier. */ > > - journal_lock_updates(journal); > > + journal = EXT3_SB(sb)->s_journal; > > > > - /* > > - * We don't want to clear needs_recovery flag when we failed > > - * to flush the journal. > > - */ > > - error = journal_flush(journal); > > - if (error < 0) > > - goto out; > > - > > - /* Journal blocked and flushed, clear needs_recovery flag. */ > > - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > > - error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > > - if (error) > > - goto out; > > - } > > - return 0; > > + /* Now we set up the journal barrier. */ > > + journal_lock_updates(journal); > > + > > + /* > > + * We don't want to clear needs_recovery flag when we failed > > + * to flush the journal. > > + */ > > + error = journal_flush(journal); > > + if (error < 0) > > + goto out; > > + > > + /* Journal blocked and flushed, clear needs_recovery flag. */ > > + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > > + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > > > > out: > > journal_unlock_updates(journal); > > @@ -2577,7 +2575,6 @@ static int ext3_unfreeze(struct super_bl > > EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > > ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > > unlock_super(sb); > > - journal_unlock_updates(EXT3_SB(sb)->s_journal); > > } > > return 0; > > } > It's not so simple as this. Ext3 relies on the mutex (the one hidden in > journal_lock_updates()) to make sure that new transaction cannot be started > while the filesystem is frozen - that's essentially what makes the > filesystem frozen. So if we want to get rid of the mutex we have to achieve > blocking by something else - ext4 uses vfs_check_frozen() in > ext4_journal_start(). I see. Still, freeze_bdev() may be called by user space through a syscall, as far as I can say, so it shouldn't leave the mutex locked. > BTW, filesystem freezing never really worked for mmaped writes under > ext3 - ext3 would have to implement page_mkwrite() callback for that - so > if you want to rely on it for suspending, this will be non-trivial. At this point the purpose of freezing filesystems is basically to prevent XFS from deadlocking with hibernation's memory preallocation. For other filesystems it may or may not make a difference depending on their implementation of freeze/unfreeze_super(). Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hello, On Mon 15-08-11 20:09:13, Rafael J. Wysocki wrote: > On Monday, August 15, 2011, Jan Kara wrote: > > Hi, > > On Thu 11-08-11 23:31:13, Rafael J. Wysocki wrote: > > > From: Rafael J. Wysocki <rjw@sisk.pl> > > > > > > In analogy with ext4 make ext3_freeze() always call > > > journal_unlock_updates() to prevent it from leaving a locked mutex > > > behind. Accordingly, modify ext3_unfreeze() so that it doesn't > > > call journal_unlock_updates() any more. > > > > > > Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl> > > > --- > > > > > > Sorry for the duplicate, the previous one was sent too early. > > > > > > --- > > > fs/ext3/super.c | 39 ++++++++++++++++++--------------------- > > > 1 file changed, 18 insertions(+), 21 deletions(-) > > > > > > Index: linux/fs/ext3/super.c > > > =================================================================== > > > --- linux.orig/fs/ext3/super.c > > > +++ linux/fs/ext3/super.c > > > @@ -2535,30 +2535,28 @@ static int ext3_sync_fs(struct super_blo > > > */ > > > static int ext3_freeze(struct super_block *sb) > > > { > > > - int error = 0; > > > + int error; > > > journal_t *journal; > > > > > > - if (!(sb->s_flags & MS_RDONLY)) { > > > - journal = EXT3_SB(sb)->s_journal; > > > + if (sb->s_flags & MS_RDONLY) > > > + return 0; > > > > > > - /* Now we set up the journal barrier. */ > > > - journal_lock_updates(journal); > > > + journal = EXT3_SB(sb)->s_journal; > > > > > > - /* > > > - * We don't want to clear needs_recovery flag when we failed > > > - * to flush the journal. > > > - */ > > > - error = journal_flush(journal); > > > - if (error < 0) > > > - goto out; > > > - > > > - /* Journal blocked and flushed, clear needs_recovery flag. */ > > > - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > > > - error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > > > - if (error) > > > - goto out; > > > - } > > > - return 0; > > > + /* Now we set up the journal barrier. */ > > > + journal_lock_updates(journal); > > > + > > > + /* > > > + * We don't want to clear needs_recovery flag when we failed > > > + * to flush the journal. > > > + */ > > > + error = journal_flush(journal); > > > + if (error < 0) > > > + goto out; > > > + > > > + /* Journal blocked and flushed, clear needs_recovery flag. */ > > > + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > > > + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > > > > > > out: > > > journal_unlock_updates(journal); > > > @@ -2577,7 +2575,6 @@ static int ext3_unfreeze(struct super_bl > > > EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); > > > ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); > > > unlock_super(sb); > > > - journal_unlock_updates(EXT3_SB(sb)->s_journal); > > > } > > > return 0; > > > } > > It's not so simple as this. Ext3 relies on the mutex (the one hidden in > > journal_lock_updates()) to make sure that new transaction cannot be started > > while the filesystem is frozen - that's essentially what makes the > > filesystem frozen. So if we want to get rid of the mutex we have to achieve > > blocking by something else - ext4 uses vfs_check_frozen() in > > ext4_journal_start(). > > I see. Still, freeze_bdev() may be called by user space through a syscall, > as far as I can say, so it shouldn't leave the mutex locked. Yes, I agree with you. That's an ugliness left over from a long time ago. I'll have a look at fixing this... > > BTW, filesystem freezing never really worked for mmaped writes under > > ext3 - ext3 would have to implement page_mkwrite() callback for that - so > > if you want to rely on it for suspending, this will be non-trivial. > > At this point the purpose of freezing filesystems is basically to > prevent XFS from deadlocking with hibernation's memory preallocation. > For other filesystems it may or may not make a difference depending on > their implementation of freeze/unfreeze_super(). What's exactly the problem? Memory preallocation enters direct reclaim and that deadlocks in the filesystem? Honza -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi, On Monday, August 15, 2011, Jan Kara wrote: > Hello, > > On Mon 15-08-11 20:09:13, Rafael J. Wysocki wrote: > > On Monday, August 15, 2011, Jan Kara wrote: ... > > > It's not so simple as this. Ext3 relies on the mutex (the one hidden in > > > journal_lock_updates()) to make sure that new transaction cannot be started > > > while the filesystem is frozen - that's essentially what makes the > > > filesystem frozen. So if we want to get rid of the mutex we have to achieve > > > blocking by something else - ext4 uses vfs_check_frozen() in > > > ext4_journal_start(). > > > > I see. Still, freeze_bdev() may be called by user space through a syscall, > > as far as I can say, so it shouldn't leave the mutex locked. > Yes, I agree with you. That's an ugliness left over from a long time ago. > I'll have a look at fixing this... Thanks! > > > BTW, filesystem freezing never really worked for mmaped writes under > > > ext3 - ext3 would have to implement page_mkwrite() callback for that - so > > > if you want to rely on it for suspending, this will be non-trivial. > > > > At this point the purpose of freezing filesystems is basically to > > prevent XFS from deadlocking with hibernation's memory preallocation. > > For other filesystems it may or may not make a difference depending on > > their implementation of freeze/unfreeze_super(). > What's exactly the problem? Memory preallocation enters direct reclaim > and that deadlocks in the filesystem? Yes, that seems to be the case. Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Aug 15, 2011 at 10:58:07PM +0200, Jan Kara wrote: > Hello, > > On Mon 15-08-11 20:09:13, Rafael J. Wysocki wrote: > > On Monday, August 15, 2011, Jan Kara wrote: > > > BTW, filesystem freezing never really worked for mmaped writes under > > > ext3 - ext3 would have to implement page_mkwrite() callback for that - so > > > if you want to rely on it for suspending, this will be non-trivial. > > > > At this point the purpose of freezing filesystems is basically to > > prevent XFS from deadlocking with hibernation's memory preallocation. > > For other filesystems it may or may not make a difference depending on > > their implementation of freeze/unfreeze_super(). > What's exactly the problem? Memory preallocation enters direct reclaim > and that deadlocks in the filesystem? Well, that's one possible manifestation. The problem is that the current hibernate code still assumes that sys_sync() results in an idle filesystem that will not change after the call if nothing is dirty. The result is that when the large memory allocation occurs for the hibernate image (after the sys_sync() call) then the shrink_slab() tends to be called. The XFS shrinkers are capable of dirtying inodes and the backing buffers of inodes that are in the reclaimable state. But those buffers cannot be flushed to disk because hibernate has already frozen the xfsbufd threads, so the shrinker doing inode reclaim hangs up on locks waiting for the buffers to be written. This either leads to deadlock or hibernate image allocation failure. Far worse, IMO, is the case where is -doesn't- deadlock, because the filesystem state can still changing after the allocation has finished due to async metadata IO completions. That has the potential to cause filesystem corruption as after resume the on-disk state may not match what is written from memory to the hibernate image. The problem really isn't XFS specific, nor is it new - the fact is that any filesystem that has registered a shrinker or can do async work in the background post-sync is vulnerable to this problem. It's just that XFS is the filesystem that usually exposes such issues, so it gets blamed for causing the problem.... Cheers, Dave.
On Tuesday, August 16, 2011, Dave Chinner wrote: > On Mon, Aug 15, 2011 at 10:58:07PM +0200, Jan Kara wrote: > > Hello, > > > > On Mon 15-08-11 20:09:13, Rafael J. Wysocki wrote: > > > On Monday, August 15, 2011, Jan Kara wrote: > > > > BTW, filesystem freezing never really worked for mmaped writes under > > > > ext3 - ext3 would have to implement page_mkwrite() callback for that - so > > > > if you want to rely on it for suspending, this will be non-trivial. > > > > > > At this point the purpose of freezing filesystems is basically to > > > prevent XFS from deadlocking with hibernation's memory preallocation. > > > For other filesystems it may or may not make a difference depending on > > > their implementation of freeze/unfreeze_super(). > > What's exactly the problem? Memory preallocation enters direct reclaim > > and that deadlocks in the filesystem? > > Well, that's one possible manifestation. The problem is that the > current hibernate code still assumes that sys_sync() results in an > idle filesystem that will not change after the call if nothing is > dirty. > > The result is that when the large memory allocation occurs for the > hibernate image (after the sys_sync() call) then the shrink_slab() > tends to be called. The XFS shrinkers are capable of dirtying inodes > and the backing buffers of inodes that are in the reclaimable state. > But those buffers cannot be flushed to disk because hibernate has > already frozen the xfsbufd threads, so the shrinker doing inode > reclaim hangs up on locks waiting for the buffers to be written. > This either leads to deadlock or hibernate image allocation failure. > > Far worse, IMO, is the case where is -doesn't- deadlock, because the > filesystem state can still changing after the allocation has > finished due to async metadata IO completions. That has the > potential to cause filesystem corruption as after resume the on-disk > state may not match what is written from memory to the hibernate > image. > > The problem really isn't XFS specific, nor is it new - the fact is > that any filesystem that has registered a shrinker or can do async > work in the background post-sync is vulnerable to this problem. It's > just that XFS is the filesystem that usually exposes such issues, so > it gets blamed for causing the problem.... I'm not saying it's XFS' fault. It's just that XFS tends to do things that other filesystems don't do and that expose the problem in the hibernate code. Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Aug 22, 2011 at 03:00:45PM +0200, Pavel Machek wrote: > Hi! > > > > What's exactly the problem? Memory preallocation enters direct reclaim > > > and that deadlocks in the filesystem? > > > > Well, that's one possible manifestation. The problem is that the > > current hibernate code still assumes that sys_sync() results in an > > idle filesystem that will not change after the call if nothing is > > dirty. > > > > The result is that when the large memory allocation occurs for the > > hibernate image (after the sys_sync() call) then the shrink_slab() > > tends to be called. The XFS shrinkers are capable of dirtying inodes > > and the backing buffers of inodes that are in the reclaimable state. > > But those buffers cannot be flushed to disk because hibernate has > > already frozen the xfsbufd threads, so the shrinker doing inode > > reclaim hangs up on locks waiting for the buffers to be written. > > This either leads to deadlock or hibernate image allocation failure. > > > > Far worse, IMO, is the case where is -doesn't- deadlock, because the > > filesystem state can still changing after the allocation has > > finished due to async metadata IO completions. That has the > > potential to cause filesystem corruption as after resume the on-disk > > state may not match what is written from memory to the hibernate > > image. > > > > The problem really isn't XFS specific, nor is it new - the fact is > > that any filesystem that has registered a shrinker or can do async > > work in the background post-sync is vulnerable to this problem. It's > > Should we avoid calling shrinkers while hibernating? If you like getting random OOM problems when hibernating, then go for it. Besides, shrinkers are used for more than just filesystems, so you might find you screw entire classes of users by doing this (eg everyone using intel graphics and 3D). > Or put BUG_ON()s into filesystem shrinkers so that this can not > happen? Definitely not. If your concern is filesystem shrinkers and you want a large hammer to hit the problem with then do your hibernate image allocation wih GFP_NOFS and the filesystem shrinkers will abort without doing anything. Cheers, Dave.
On Tuesday, August 23, 2011, Dave Chinner wrote: > On Mon, Aug 22, 2011 at 03:00:45PM +0200, Pavel Machek wrote: > > Hi! > > > > > > What's exactly the problem? Memory preallocation enters direct reclaim > > > > and that deadlocks in the filesystem? > > > > > > Well, that's one possible manifestation. The problem is that the > > > current hibernate code still assumes that sys_sync() results in an > > > idle filesystem that will not change after the call if nothing is > > > dirty. > > > > > > The result is that when the large memory allocation occurs for the > > > hibernate image (after the sys_sync() call) then the shrink_slab() > > > tends to be called. The XFS shrinkers are capable of dirtying inodes > > > and the backing buffers of inodes that are in the reclaimable state. > > > But those buffers cannot be flushed to disk because hibernate has > > > already frozen the xfsbufd threads, so the shrinker doing inode > > > reclaim hangs up on locks waiting for the buffers to be written. > > > This either leads to deadlock or hibernate image allocation failure. > > > > > > Far worse, IMO, is the case where is -doesn't- deadlock, because the > > > filesystem state can still changing after the allocation has > > > finished due to async metadata IO completions. That has the > > > potential to cause filesystem corruption as after resume the on-disk > > > state may not match what is written from memory to the hibernate > > > image. > > > > > > The problem really isn't XFS specific, nor is it new - the fact is > > > that any filesystem that has registered a shrinker or can do async > > > work in the background post-sync is vulnerable to this problem. It's > > > > Should we avoid calling shrinkers while hibernating? > > If you like getting random OOM problems when hibernating, then go > for it. Besides, shrinkers are used for more than just filesystems, > so you might find you screw entire classes of users by doing this > (eg everyone using intel graphics and 3D). > > > Or put BUG_ON()s into filesystem shrinkers so that this can not > > happen? > > Definitely not. If your concern is filesystem shrinkers and you want > a large hammer to hit the problem with then do your hibernate > image allocation wih GFP_NOFS and the filesystem shrinkers will > abort without doing anything. I think we can do that, actually. Thanks, Rafael -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Index: linux/fs/ext3/super.c =================================================================== --- linux.orig/fs/ext3/super.c +++ linux/fs/ext3/super.c @@ -2535,30 +2535,28 @@ static int ext3_sync_fs(struct super_blo */ static int ext3_freeze(struct super_block *sb) { - int error = 0; + int error; journal_t *journal; - if (!(sb->s_flags & MS_RDONLY)) { - journal = EXT3_SB(sb)->s_journal; + if (sb->s_flags & MS_RDONLY) + return 0; - /* Now we set up the journal barrier. */ - journal_lock_updates(journal); + journal = EXT3_SB(sb)->s_journal; - /* - * We don't want to clear needs_recovery flag when we failed - * to flush the journal. - */ - error = journal_flush(journal); - if (error < 0) - goto out; - - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); - if (error) - goto out; - } - return 0; + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + error = journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); out: journal_unlock_updates(journal); @@ -2577,7 +2575,6 @@ static int ext3_unfreeze(struct super_bl EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); unlock_super(sb); - journal_unlock_updates(EXT3_SB(sb)->s_journal); } return 0; }