diff mbox

[1/4] fs: Improve filesystem freezing handling

Message ID 1326331253-6497-2-git-send-email-jack@suse.cz
State New, archived
Headers show

Commit Message

Jan Kara Jan. 12, 2012, 1:20 a.m. UTC
Currently, exclusion between ->page_mkwrite() and filesystem freezing has been
handled by setting page dirty and then verifying s_frozen. This guaranteed that
either the freezing code sees the faulted page, writes it, and writeprotects it
again or we see s_frozen set and bail out of page fault. This works to protect
from page being marked writeable while filesystem freezing is running but has
an unpleasant artefact of leaving dirty (although unmodified and
writeprotected) pages on frozen filesystem. This artefact then requires
workarounds in writeback code and other places.

Also generally vfs_check_frozen() tests are racy since the filesystem can be
frozen just after the test is performed. Thus in other write paths we can
end up marking some pages or inodes dirty even though filesystem is already
frozen. Again this creates problems with flusher thread hanging on frozen
filesystem.

This patch aims at providing exclusion between write paths which dirty data (we
don't have to worry about metadata since that is handled by filesystems in
->freeze_fs) and filesystem freezing. We implement a writer-freeze read-write
semaphore in the superblock. Write paths which dirty data such as
->block_page_mkwrite() implementations, or ->aio_write() implementations hold
reader side of the semaphore.  Filesystem freezing code holds the writer side.
Only that we don't really want to bounce cachelines of the semaphore between
CPUs for each write happening. So we implement the reader side of the semaphore
as a per-cpu counter and the writer side is implemented using s_frozen
superblock field.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/super.c         |  121 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h |   14 ++++++
 2 files changed, 134 insertions(+), 1 deletions(-)

Comments

Andreas Dilger Jan. 12, 2012, 7:53 p.m. UTC | #1
On 2012-01-11, at 6:20 PM, Jan Kara wrote:
> Currently, exclusion between ->page_mkwrite() and filesystem freezing has been
> handled by setting page dirty and then verifying s_frozen. This guaranteed that
> either the freezing code sees the faulted page, writes it, and writeprotects it
> again or we see s_frozen set and bail out of page fault. This works to protect
> from page being marked writeable while filesystem freezing is running but has
> an unpleasant artefact of leaving dirty (although unmodified and
> writeprotected) pages on frozen filesystem. This artefact then requires
> workarounds in writeback code and other places.
> 
> Also generally vfs_check_frozen() tests are racy since the filesystem can be
> frozen just after the test is performed. Thus in other write paths we can
> end up marking some pages or inodes dirty even though filesystem is already
> frozen. Again this creates problems with flusher thread hanging on frozen
> filesystem.
> 
> This patch aims at providing exclusion between write paths which dirty data (we
> don't have to worry about metadata since that is handled by filesystems in
> ->freeze_fs) and filesystem freezing. We implement a writer-freeze read-write
> semaphore in the superblock. Write paths which dirty data such as
> ->block_page_mkwrite() implementations, or ->aio_write() implementations hold
> reader side of the semaphore.  Filesystem freezing code holds the writer side.
> Only that we don't really want to bounce cachelines of the semaphore between
> CPUs for each write happening. So we implement the reader side of the semaphore
> as a per-cpu counter and the writer side is implemented using s_frozen
> superblock field.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
> fs/super.c         |  121 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/fs.h |   14 ++++++
> 2 files changed, 134 insertions(+), 1 deletions(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index afd0f1a..c85c64c 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -32,12 +32,15 @@
> #include <linux/backing-dev.h>
> #include <linux/rculist_bl.h>
> #include <linux/cleancache.h>
> +#include <linux/lockdep.h>
> #include "internal.h"
> 
> 
> LIST_HEAD(super_blocks);
> DEFINE_SPINLOCK(sb_lock);
> 
> +static struct lock_class_key sb_writers_key;
> +
> /*
>  * One thing we have to be careful of with a per-sb shrinker is that we don't
>  * drop the last active reference to the superblock from within the shrinker.
> @@ -183,6 +186,13 @@ static struct super_block *alloc_super(struct file_system_type *type)
> 		s->s_shrink.seeks = DEFAULT_SEEKS;
> 		s->s_shrink.shrink = prune_super;
> 		s->s_shrink.batch = 1024;
> +
> +		init_waitqueue_head(&s->s_writers_wait);
> +#ifdef CONFIG_SMP
> +		s->s_page_faults = alloc_percpu(int);
> +#endif
> +		lockdep_init_map(&s->s_writers_lock_map, "sb_writers",
> +				 &sb_writers_key, 0);
> 	}
> out:
> 	return s;
> @@ -1126,6 +1136,84 @@ out:
> }
> 
> /**
> + * sb_start_write - drop write access to a superblock
> + * @sb: the super we wrote to
> + *
> + * Decrement number of writers to the filesystem and wake up possible
> + * waiters wanting to freeze the filesystem.
> + */
> +void sb_end_write(struct super_block *sb)
> +{
> +#ifdef CONFIG_SMP
> +	this_cpu_dec(sb->s_writers);
> +#else
> +	preempt_disable();
> +	sb->s_writers--;
> +	preempt_enable();
> +#endif
> +	/*
> +	 * Make sure s_writers are updated before we wake up waiters in
> +	 * freeze_super().
> +	 */
> +	smp_mb();
> +	if (waitqueue_active(&sb->s_writers_wait))
> +		wake_up(&sb->s_writers_wait);
> +	rwsem_release(&sb->s_writers_lock_map, 1, _RET_IP_);
> +}

Since this function is needed for calling __block_page_mkwrite(), which is
EXPORT_SYMBOL(), both sb_start_write() and sb_end_write() themselves need
to be EXPORT_SYMBOL().

> +/**
> + * sb_start_write - get write access to a superblock
> + * @sb: the super we write to
> + *
> + * When a process wants to write data to a filesystem (i.e. dirty a page),
> + * it should embed the operation in a sb_start_write() - sb_end_write() pair
> + * to get exclusion against filesystem freezing. This function increments
> + * number of writers to the filesystem and waits if filesystem is frozen until
> + * it is thawed.
> + */
> +void sb_start_write(struct super_block *sb)
> +{
> +retry:
> +	rwsem_acquire_read(&sb->s_writers_lock_map, 0, 0, _RET_IP_);
> +	vfs_check_frozen(sb, SB_FREEZE_WRITE);
> +#ifdef CONFIG_SMP
> +	this_cpu_inc(sb->s_writers);
> +#else
> +	preempt_disable();
> +	sb->s_writers++;
> +	preempt_enable();
> +#endif
> +	/*
> +	 * Make sure s_writers are updated before we check s_frozen.
> +	 * freeze_super() first sets s_frozen and then checks s_writers.
> +	 */
> +	smp_mb();
> +	if (sb->s_frozen != SB_UNFROZEN) {
> +		sb_end_write(sb);
> +		goto retry;
> +	}
> +}
> +
> +/*
> + * Get number of writers to the superblock
> + */
> +static int get_writers_count(struct super_block *sb)
> +{
> +	int writers;
> +#ifdef CONFIG_SMP
> +	int cpu;
> +
> +	writers = 0;
> +	for_each_possible_cpu(cpu) {
> +		writers += *per_cpu_ptr(sb->s_writers, cpu);
> +	}
> +#else
> +	writers = sb->s_writers;
> +#endif
> +	return writers;
> +}
> +
> +/**
>  * freeze_super - lock the filesystem and force it into a consistent state
>  * @sb: the super to lock
>  *
> @@ -1136,6 +1224,7 @@ out:
> int freeze_super(struct super_block *sb)
> {
> 	int ret;
> +	int writers;
> 
> 	atomic_inc(&sb->s_active);
> 	down_write(&sb->s_umount);
> @@ -1151,8 +1240,36 @@ int freeze_super(struct super_block *sb)
> 		return 0;
> 	}
> 
> +	rwsem_acquire(&sb->s_writers_lock_map, 0, 0, _THIS_IP_);
> 	sb->s_frozen = SB_FREEZE_WRITE;
> -	smp_wmb();
> +	/*
> +	 * Now wait for all page faults to finish. ->page_mkwrite()
> +	 * implementations must call vfs_check_frozen() before starting
> +	 * a fault so that we cannot livelock here. Because of that we
> +	 * are guaranteed that from this moment on new ->page_mkwrite()
> +	 * calls will block and we just have to wait for s_page_faults
> +	 * to drop to zero (in a sum).
> +	 */
> +	do {
> +		DEFINE_WAIT(wait);
> +
> +		/*
> +		 * We use a barrier in prepare_to_wait() to separate setting
> +		 * of s_frozen and checking of s_writers
> +		 */
> +		prepare_to_wait(&sb->s_writers_wait, &wait,
> +				TASK_UNINTERRUPTIBLE);
> +		/*
> +		 * We must iterate over all (even offline) CPUs because of CPU
> + 		 * hotplug their entries could still be non-zero. This is slow
> +		 * when lots of CPUs are configured but hey, filesystem freezing
> +		 * isn't exactly cheap anyway.
> +		 */
> +		writers = get_writers_count(sb);
> +		if (writers)
> +			schedule();
> +		finish_wait(&sb->s_writers_wait, &wait);
> +	} while (writers);
> 
> 	sync_filesystem(sb);
> 
> @@ -1165,6 +1282,7 @@ int freeze_super(struct super_block *sb)
> 		if (ret) {
> 			printk(KERN_ERR
> 				"VFS:Filesystem freeze failed\n");
> +			rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
> 			sb->s_frozen = SB_UNFROZEN;
> 			deactivate_locked_super(sb);
> 			return ret;
> @@ -1206,6 +1324,7 @@ int thaw_super(struct super_block *sb)
> 	}
> 
> out:
> +	rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
> 	sb->s_frozen = SB_UNFROZEN;
> 	smp_wmb();
> 	wake_up(&sb->s_wait_unfrozen);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e313022..297b263 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -10,6 +10,7 @@
> #include <linux/ioctl.h>
> #include <linux/blk_types.h>
> #include <linux/types.h>
> +#include <linux/lockdep.h>
> 
> /*
>  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
> @@ -1445,6 +1446,16 @@ struct super_block {
> 
> 	int			s_frozen;
> 	wait_queue_head_t	s_wait_unfrozen;
> +#ifdef CONFIG_SMP
> +	int __percpu 		*s_writers;	/* counter of running writes */
> +#else
> +	int			s_writers;	/* counter of running writes */
> +#endif
> +	wait_queue_head_t	s_writers_wait;	/* queue for waiting for
> +						   writers to finish */
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	struct lockdep_map	s_writers_lock_map;
> +#endif
> 
> 	char s_id[32];				/* Informational name */
> 	u8 s_uuid[16];				/* UUID */
> @@ -1501,6 +1512,9 @@ enum {
> #define vfs_check_frozen(sb, level) \
> 	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
> 
> +void sb_end_write(struct super_block *sb);
> +void sb_start_write(struct super_block *sb);
> +
> /*
>  * until VFS tracks user namespaces for inodes, just make all files
>  * belong to init_user_ns
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas





--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Jan. 12, 2012, 8:07 p.m. UTC | #2
On Thu 12-01-12 12:53:35, Andreas Dilger wrote:
> On 2012-01-11, at 6:20 PM, Jan Kara wrote:
> > /**
> > + * sb_start_write - drop write access to a superblock
> > + * @sb: the super we wrote to
> > + *
> > + * Decrement number of writers to the filesystem and wake up possible
> > + * waiters wanting to freeze the filesystem.
> > + */
> > +void sb_end_write(struct super_block *sb)
> > +{
> > +#ifdef CONFIG_SMP
> > +	this_cpu_dec(sb->s_writers);
> > +#else
> > +	preempt_disable();
> > +	sb->s_writers--;
> > +	preempt_enable();
> > +#endif
> > +	/*
> > +	 * Make sure s_writers are updated before we wake up waiters in
> > +	 * freeze_super().
> > +	 */
> > +	smp_mb();
> > +	if (waitqueue_active(&sb->s_writers_wait))
> > +		wake_up(&sb->s_writers_wait);
> > +	rwsem_release(&sb->s_writers_lock_map, 1, _RET_IP_);
> > +}
> 
> Since this function is needed for calling __block_page_mkwrite(), which is
> EXPORT_SYMBOL(), both sb_start_write() and sb_end_write() themselves need
> to be EXPORT_SYMBOL().
  Good point. Fixed. Thanks.

								Honza
Eric Sandeen Jan. 12, 2012, 10:57 p.m. UTC | #3
On 1/11/12 7:20 PM, Jan Kara wrote:
> Currently, exclusion between ->page_mkwrite() and filesystem freezing has been
> handled by setting page dirty and then verifying s_frozen. This guaranteed that
> either the freezing code sees the faulted page, writes it, and writeprotects it
> again or we see s_frozen set and bail out of page fault. This works to protect
> from page being marked writeable while filesystem freezing is running but has
> an unpleasant artefact of leaving dirty (although unmodified and
> writeprotected) pages on frozen filesystem. This artefact then requires
> workarounds in writeback code and other places.
> 
> Also generally vfs_check_frozen() tests are racy since the filesystem can be
> frozen just after the test is performed. Thus in other write paths we can
> end up marking some pages or inodes dirty even though filesystem is already
> frozen. Again this creates problems with flusher thread hanging on frozen
> filesystem.
> 
> This patch aims at providing exclusion between write paths which dirty data (we
> don't have to worry about metadata since that is handled by filesystems in
> ->freeze_fs) and filesystem freezing. We implement a writer-freeze read-write
> semaphore in the superblock. Write paths which dirty data such as
> ->block_page_mkwrite() implementations, or ->aio_write() implementations hold
> reader side of the semaphore.  Filesystem freezing code holds the writer side.
> Only that we don't really want to bounce cachelines of the semaphore between
> CPUs for each write happening. So we implement the reader side of the semaphore
> as a per-cpu counter and the writer side is implemented using s_frozen
> superblock field.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/super.c         |  121 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/fs.h |   14 ++++++
>  2 files changed, 134 insertions(+), 1 deletions(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index afd0f1a..c85c64c 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -32,12 +32,15 @@
>  #include <linux/backing-dev.h>
>  #include <linux/rculist_bl.h>
>  #include <linux/cleancache.h>
> +#include <linux/lockdep.h>
>  #include "internal.h"
>  
>  
>  LIST_HEAD(super_blocks);
>  DEFINE_SPINLOCK(sb_lock);
>  
> +static struct lock_class_key sb_writers_key;
> +
>  /*
>   * One thing we have to be careful of with a per-sb shrinker is that we don't
>   * drop the last active reference to the superblock from within the shrinker.
> @@ -183,6 +186,13 @@ static struct super_block *alloc_super(struct file_system_type *type)
>  		s->s_shrink.seeks = DEFAULT_SEEKS;
>  		s->s_shrink.shrink = prune_super;
>  		s->s_shrink.batch = 1024;
> +
> +		init_waitqueue_head(&s->s_writers_wait);
> +#ifdef CONFIG_SMP
> +		s->s_page_faults = alloc_percpu(int);

isn't this s->s_writers?  s->s_page_faults isn't defined anywhere.

> +#endif
> +		lockdep_init_map(&s->s_writers_lock_map, "sb_writers",
> +				 &sb_writers_key, 0);
>  	}
>  out:
>  	return s;
> @@ -1126,6 +1136,84 @@ out:
>  }
>  
>  /**
> + * sb_start_write - drop write access to a superblock
      ^^^^^^^^^^^^^^

s/b sb_end_write

> + * @sb: the super we wrote to
> + *
> + * Decrement number of writers to the filesystem and wake up possible
> + * waiters wanting to freeze the filesystem.
> + */
> +void sb_end_write(struct super_block *sb)
> +{
> +#ifdef CONFIG_SMP
> +	this_cpu_dec(sb->s_writers);
> +#else
> +	preempt_disable();
> +	sb->s_writers--;
> +	preempt_enable();
> +#endif
> +	/*
> +	 * Make sure s_writers are updated before we wake up waiters in
> +	 * freeze_super().
> +	 */
> +	smp_mb();
> +	if (waitqueue_active(&sb->s_writers_wait))
> +		wake_up(&sb->s_writers_wait);
> +	rwsem_release(&sb->s_writers_lock_map, 1, _RET_IP_);
> +}
> +
> +/**
> + * sb_start_write - get write access to a superblock
> + * @sb: the super we write to
> + *
> + * When a process wants to write data to a filesystem (i.e. dirty a page),
> + * it should embed the operation in a sb_start_write() - sb_end_write() pair
> + * to get exclusion against filesystem freezing. This function increments
> + * number of writers to the filesystem and waits if filesystem is frozen until
> + * it is thawed.
> + */
> +void sb_start_write(struct super_block *sb)
> +{
> +retry:
> +	rwsem_acquire_read(&sb->s_writers_lock_map, 0, 0, _RET_IP_);
> +	vfs_check_frozen(sb, SB_FREEZE_WRITE);
> +#ifdef CONFIG_SMP
> +	this_cpu_inc(sb->s_writers);
> +#else
> +	preempt_disable();
> +	sb->s_writers++;
> +	preempt_enable();
> +#endif
> +	/*
> +	 * Make sure s_writers are updated before we check s_frozen.
> +	 * freeze_super() first sets s_frozen and then checks s_writers.
> +	 */
> +	smp_mb();
> +	if (sb->s_frozen != SB_UNFROZEN) {
> +		sb_end_write(sb);
> +		goto retry;
> +	}
> +}
> +
> +/*
> + * Get number of writers to the superblock
> + */
> +static int get_writers_count(struct super_block *sb)
> +{
> +	int writers;
> +#ifdef CONFIG_SMP
> +	int cpu;
> +
> +	writers = 0;
> +	for_each_possible_cpu(cpu) {
> +		writers += *per_cpu_ptr(sb->s_writers, cpu);
> +	}
> +#else
> +	writers = sb->s_writers;
> +#endif
> +	return writers;
> +}
> +
> +/**
>   * freeze_super - lock the filesystem and force it into a consistent state
>   * @sb: the super to lock
>   *
> @@ -1136,6 +1224,7 @@ out:
>  int freeze_super(struct super_block *sb)
>  {
>  	int ret;
> +	int writers;
>  
>  	atomic_inc(&sb->s_active);
>  	down_write(&sb->s_umount);
> @@ -1151,8 +1240,36 @@ int freeze_super(struct super_block *sb)
>  		return 0;
>  	}
>  
> +	rwsem_acquire(&sb->s_writers_lock_map, 0, 0, _THIS_IP_);
>  	sb->s_frozen = SB_FREEZE_WRITE;
> -	smp_wmb();
> +	/*
> +	 * Now wait for all page faults to finish. ->page_mkwrite()
> +	 * implementations must call vfs_check_frozen() before starting
> +	 * a fault so that we cannot livelock here. Because of that we
> +	 * are guaranteed that from this moment on new ->page_mkwrite()
> +	 * calls will block and we just have to wait for s_page_faults

wait for s_writers, right?

> +	 * to drop to zero (in a sum).
> +	 */
> +	do {
> +		DEFINE_WAIT(wait);
> +
> +		/*
> +		 * We use a barrier in prepare_to_wait() to separate setting
> +		 * of s_frozen and checking of s_writers
> +		 */
> +		prepare_to_wait(&sb->s_writers_wait, &wait,
> +				TASK_UNINTERRUPTIBLE);
> +		/*
> +		 * We must iterate over all (even offline) CPUs because of CPU
> + 		 * hotplug their entries could still be non-zero. This is slow
> +		 * when lots of CPUs are configured but hey, filesystem freezing
> +		 * isn't exactly cheap anyway.
> +		 */
> +		writers = get_writers_count(sb);
> +		if (writers)
> +			schedule();
> +		finish_wait(&sb->s_writers_wait, &wait);
> +	} while (writers);
>  
>  	sync_filesystem(sb);
>  
> @@ -1165,6 +1282,7 @@ int freeze_super(struct super_block *sb)
>  		if (ret) {
>  			printk(KERN_ERR
>  				"VFS:Filesystem freeze failed\n");
> +			rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
>  			sb->s_frozen = SB_UNFROZEN;
>  			deactivate_locked_super(sb);
>  			return ret;
> @@ -1206,6 +1324,7 @@ int thaw_super(struct super_block *sb)
>  	}
>  
>  out:
> +	rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
>  	sb->s_frozen = SB_UNFROZEN;
>  	smp_wmb();
>  	wake_up(&sb->s_wait_unfrozen);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e313022..297b263 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -10,6 +10,7 @@
>  #include <linux/ioctl.h>
>  #include <linux/blk_types.h>
>  #include <linux/types.h>
> +#include <linux/lockdep.h>
>  
>  /*
>   * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
> @@ -1445,6 +1446,16 @@ struct super_block {
>  
>  	int			s_frozen;
>  	wait_queue_head_t	s_wait_unfrozen;
> +#ifdef CONFIG_SMP
> +	int __percpu 		*s_writers;	/* counter of running writes */
> +#else
> +	int			s_writers;	/* counter of running writes */
> +#endif
> +	wait_queue_head_t	s_writers_wait;	/* queue for waiting for
> +						   writers to finish */
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	struct lockdep_map	s_writers_lock_map;
> +#endif
>  
>  	char s_id[32];				/* Informational name */
>  	u8 s_uuid[16];				/* UUID */
> @@ -1501,6 +1512,9 @@ enum {
>  #define vfs_check_frozen(sb, level) \
>  	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
>  
> +void sb_end_write(struct super_block *sb);
> +void sb_start_write(struct super_block *sb);
> +
>  /*
>   * until VFS tracks user namespaces for inodes, just make all files
>   * belong to init_user_ns

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara Jan. 12, 2012, 11:15 p.m. UTC | #4
On Thu 12-01-12 16:57:42, Eric Sandeen wrote:
> On 1/11/12 7:20 PM, Jan Kara wrote:
> > @@ -183,6 +186,13 @@ static struct super_block *alloc_super(struct file_system_type *type)
> >  		s->s_shrink.seeks = DEFAULT_SEEKS;
> >  		s->s_shrink.shrink = prune_super;
> >  		s->s_shrink.batch = 1024;
> > +
> > +		init_waitqueue_head(&s->s_writers_wait);
> > +#ifdef CONFIG_SMP
> > +		s->s_page_faults = alloc_percpu(int);
> 
> isn't this s->s_writers?  s->s_page_faults isn't defined anywhere.
  Right. Leftover from original implementation and since I was doing
initial testing only using UML, I didn't spot this. Thanks.

> > +#endif
> > +		lockdep_init_map(&s->s_writers_lock_map, "sb_writers",
> > +				 &sb_writers_key, 0);
> >  	}
> >  out:
> >  	return s;
> > @@ -1126,6 +1136,84 @@ out:
> >  }
> >  
> >  /**
> > + * sb_start_write - drop write access to a superblock
>       ^^^^^^^^^^^^^^
> 
> s/b sb_end_write
  Fixed.

> > @@ -1136,6 +1224,7 @@ out:
> >  int freeze_super(struct super_block *sb)
> >  {
> >  	int ret;
> > +	int writers;
> >  
> >  	atomic_inc(&sb->s_active);
> >  	down_write(&sb->s_umount);
> > @@ -1151,8 +1240,36 @@ int freeze_super(struct super_block *sb)
> >  		return 0;
> >  	}
> >  
> > +	rwsem_acquire(&sb->s_writers_lock_map, 0, 0, _THIS_IP_);
> >  	sb->s_frozen = SB_FREEZE_WRITE;
> > -	smp_wmb();
> > +	/*
> > +	 * Now wait for all page faults to finish. ->page_mkwrite()
> > +	 * implementations must call vfs_check_frozen() before starting
> > +	 * a fault so that we cannot livelock here. Because of that we
> > +	 * are guaranteed that from this moment on new ->page_mkwrite()
> > +	 * calls will block and we just have to wait for s_page_faults
> 
> wait for s_writers, right?
  Yes. Fixed.

  Thanks for review.

								Honza
Dave Chinner Jan. 13, 2012, 1:26 a.m. UTC | #5
On Thu, Jan 12, 2012 at 02:20:50AM +0100, Jan Kara wrote:
> + *
> + * Decrement number of writers to the filesystem and wake up possible
> + * waiters wanting to freeze the filesystem.
> + */
> +void sb_end_write(struct super_block *sb)
> +{
> +#ifdef CONFIG_SMP
> +	this_cpu_dec(sb->s_writers);
> +#else
> +	preempt_disable();
> +	sb->s_writers--;
> +	preempt_enable();
> +#endif

I really dislike this type of open coded per-cpu counter
implementation. I can't see that there is no good reason to use it
over percpu_counters here which abstract all this mess away.

i.e. it is relatively rare that the per-cpu count will nest
greater than the percpu_counter batch size (needs more than 32
concurrent blocked active writes per CPU), so there is no
significant overhead to using the percpu_counters here.

Indeed, if there are that many blocked writes per CPU, then the
overhead of an occasional global counter update is going to be lost
in the noise of everything else that is going on.

Cheers,

Dave.
Jan Kara Jan. 13, 2012, 10:12 a.m. UTC | #6
On Fri 13-01-12 12:26:43, Dave Chinner wrote:
> On Thu, Jan 12, 2012 at 02:20:50AM +0100, Jan Kara wrote:
> > + *
> > + * Decrement number of writers to the filesystem and wake up possible
> > + * waiters wanting to freeze the filesystem.
> > + */
> > +void sb_end_write(struct super_block *sb)
> > +{
> > +#ifdef CONFIG_SMP
> > +	this_cpu_dec(sb->s_writers);
> > +#else
> > +	preempt_disable();
> > +	sb->s_writers--;
> > +	preempt_enable();
> > +#endif
> 
> I really dislike this type of open coded per-cpu counter
> implementation. I can't see that there is no good reason to use it
> over percpu_counters here which abstract all this mess away.
> 
> i.e. it is relatively rare that the per-cpu count will nest
> greater than the percpu_counter batch size (needs more than 32
> concurrent blocked active writes per CPU), so there is no
> significant overhead to using the percpu_counters here.
> 
> Indeed, if there are that many blocked writes per CPU, then the
> overhead of an occasional global counter update is going to be lost
> in the noise of everything else that is going on.
  Well, I just did it the way mnt_want_write / mnt_put_write does it. But
you are right that it's unnecessary so it's a good idea to switch the code
to using per-cpu counters. Thanks for the idea.

								Honza
diff mbox

Patch

diff --git a/fs/super.c b/fs/super.c
index afd0f1a..c85c64c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -32,12 +32,15 @@ 
 #include <linux/backing-dev.h>
 #include <linux/rculist_bl.h>
 #include <linux/cleancache.h>
+#include <linux/lockdep.h>
 #include "internal.h"
 
 
 LIST_HEAD(super_blocks);
 DEFINE_SPINLOCK(sb_lock);
 
+static struct lock_class_key sb_writers_key;
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -183,6 +186,13 @@  static struct super_block *alloc_super(struct file_system_type *type)
 		s->s_shrink.seeks = DEFAULT_SEEKS;
 		s->s_shrink.shrink = prune_super;
 		s->s_shrink.batch = 1024;
+
+		init_waitqueue_head(&s->s_writers_wait);
+#ifdef CONFIG_SMP
+		s->s_page_faults = alloc_percpu(int);
+#endif
+		lockdep_init_map(&s->s_writers_lock_map, "sb_writers",
+				 &sb_writers_key, 0);
 	}
 out:
 	return s;
@@ -1126,6 +1136,84 @@  out:
 }
 
 /**
+ * sb_start_write - drop write access to a superblock
+ * @sb: the super we wrote to
+ *
+ * Decrement number of writers to the filesystem and wake up possible
+ * waiters wanting to freeze the filesystem.
+ */
+void sb_end_write(struct super_block *sb)
+{
+#ifdef CONFIG_SMP
+	this_cpu_dec(sb->s_writers);
+#else
+	preempt_disable();
+	sb->s_writers--;
+	preempt_enable();
+#endif
+	/*
+	 * Make sure s_writers are updated before we wake up waiters in
+	 * freeze_super().
+	 */
+	smp_mb();
+	if (waitqueue_active(&sb->s_writers_wait))
+		wake_up(&sb->s_writers_wait);
+	rwsem_release(&sb->s_writers_lock_map, 1, _RET_IP_);
+}
+
+/**
+ * sb_start_write - get write access to a superblock
+ * @sb: the super we write to
+ *
+ * When a process wants to write data to a filesystem (i.e. dirty a page),
+ * it should embed the operation in a sb_start_write() - sb_end_write() pair
+ * to get exclusion against filesystem freezing. This function increments
+ * number of writers to the filesystem and waits if filesystem is frozen until
+ * it is thawed.
+ */
+void sb_start_write(struct super_block *sb)
+{
+retry:
+	rwsem_acquire_read(&sb->s_writers_lock_map, 0, 0, _RET_IP_);
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+#ifdef CONFIG_SMP
+	this_cpu_inc(sb->s_writers);
+#else
+	preempt_disable();
+	sb->s_writers++;
+	preempt_enable();
+#endif
+	/*
+	 * Make sure s_writers are updated before we check s_frozen.
+	 * freeze_super() first sets s_frozen and then checks s_writers.
+	 */
+	smp_mb();
+	if (sb->s_frozen != SB_UNFROZEN) {
+		sb_end_write(sb);
+		goto retry;
+	}
+}
+
+/*
+ * Get number of writers to the superblock
+ */
+static int get_writers_count(struct super_block *sb)
+{
+	int writers;
+#ifdef CONFIG_SMP
+	int cpu;
+
+	writers = 0;
+	for_each_possible_cpu(cpu) {
+		writers += *per_cpu_ptr(sb->s_writers, cpu);
+	}
+#else
+	writers = sb->s_writers;
+#endif
+	return writers;
+}
+
+/**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
  *
@@ -1136,6 +1224,7 @@  out:
 int freeze_super(struct super_block *sb)
 {
 	int ret;
+	int writers;
 
 	atomic_inc(&sb->s_active);
 	down_write(&sb->s_umount);
@@ -1151,8 +1240,36 @@  int freeze_super(struct super_block *sb)
 		return 0;
 	}
 
+	rwsem_acquire(&sb->s_writers_lock_map, 0, 0, _THIS_IP_);
 	sb->s_frozen = SB_FREEZE_WRITE;
-	smp_wmb();
+	/*
+	 * Now wait for all page faults to finish. ->page_mkwrite()
+	 * implementations must call vfs_check_frozen() before starting
+	 * a fault so that we cannot livelock here. Because of that we
+	 * are guaranteed that from this moment on new ->page_mkwrite()
+	 * calls will block and we just have to wait for s_page_faults
+	 * to drop to zero (in a sum).
+	 */
+	do {
+		DEFINE_WAIT(wait);
+
+		/*
+		 * We use a barrier in prepare_to_wait() to separate setting
+		 * of s_frozen and checking of s_writers
+		 */
+		prepare_to_wait(&sb->s_writers_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		/*
+		 * We must iterate over all (even offline) CPUs because of CPU
+ 		 * hotplug their entries could still be non-zero. This is slow
+		 * when lots of CPUs are configured but hey, filesystem freezing
+		 * isn't exactly cheap anyway.
+		 */
+		writers = get_writers_count(sb);
+		if (writers)
+			schedule();
+		finish_wait(&sb->s_writers_wait, &wait);
+	} while (writers);
 
 	sync_filesystem(sb);
 
@@ -1165,6 +1282,7 @@  int freeze_super(struct super_block *sb)
 		if (ret) {
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
+			rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
 			sb->s_frozen = SB_UNFROZEN;
 			deactivate_locked_super(sb);
 			return ret;
@@ -1206,6 +1324,7 @@  int thaw_super(struct super_block *sb)
 	}
 
 out:
+	rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
 	sb->s_frozen = SB_UNFROZEN;
 	smp_wmb();
 	wake_up(&sb->s_wait_unfrozen);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e313022..297b263 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@ 
 #include <linux/ioctl.h>
 #include <linux/blk_types.h>
 #include <linux/types.h>
+#include <linux/lockdep.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -1445,6 +1446,16 @@  struct super_block {
 
 	int			s_frozen;
 	wait_queue_head_t	s_wait_unfrozen;
+#ifdef CONFIG_SMP
+	int __percpu 		*s_writers;	/* counter of running writes */
+#else
+	int			s_writers;	/* counter of running writes */
+#endif
+	wait_queue_head_t	s_writers_wait;	/* queue for waiting for
+						   writers to finish */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	s_writers_lock_map;
+#endif
 
 	char s_id[32];				/* Informational name */
 	u8 s_uuid[16];				/* UUID */
@@ -1501,6 +1512,9 @@  enum {
 #define vfs_check_frozen(sb, level) \
 	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
 
+void sb_end_write(struct super_block *sb);
+void sb_start_write(struct super_block *sb);
+
 /*
  * until VFS tracks user namespaces for inodes, just make all files
  * belong to init_user_ns