diff mbox series

[07/13] xfs: Convert to use invalidate_lock

Message ID 20210525135100.11221-7-jack@suse.cz
State New
Headers show
Series fs: Hole punch vs page cache filling races | expand

Commit Message

Jan Kara May 25, 2021, 1:50 p.m. UTC
Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
purpose of invalidate_lock is exactly the same. Note that the locking in
__xfs_filemap_fault() slightly changes as filemap_fault() already takes
invalidate_lock.

Reviewed-by: Christoph Hellwig <hch@lst.de>
CC: <linux-xfs@vger.kernel.org>
CC: "Darrick J. Wong" <darrick.wong@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/xfs/xfs_file.c  | 12 ++++++-----
 fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++--------------------
 fs/xfs/xfs_inode.h |  1 -
 fs/xfs/xfs_super.c |  2 --
 4 files changed, 36 insertions(+), 31 deletions(-)

Comments

Darrick J. Wong May 25, 2021, 9:37 p.m. UTC | #1
On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote:
> Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
> purpose of invalidate_lock is exactly the same. Note that the locking in
> __xfs_filemap_fault() slightly changes as filemap_fault() already takes
> invalidate_lock.
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> CC: <linux-xfs@vger.kernel.org>
> CC: "Darrick J. Wong" <darrick.wong@oracle.com>

It's djwong@kernel.org now.

> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/xfs/xfs_file.c  | 12 ++++++-----
>  fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++--------------------
>  fs/xfs/xfs_inode.h |  1 -
>  fs/xfs/xfs_super.c |  2 --
>  4 files changed, 36 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 396ef36dcd0a..dc9cb5c20549 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1282,7 +1282,7 @@ xfs_file_llseek(
>   *
>   * mmap_lock (MM)
>   *   sb_start_pagefault(vfs, freeze)
> - *     i_mmaplock (XFS - truncate serialisation)
> + *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
>   *       page_lock (MM)
>   *         i_lock (XFS - extent map serialisation)
>   */
> @@ -1303,24 +1303,26 @@ __xfs_filemap_fault(
>  		file_update_time(vmf->vma->vm_file);
>  	}
>  
> -	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  	if (IS_DAX(inode)) {
>  		pfn_t pfn;
>  
> +		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
>  				(write_fault && !vmf->cow_page) ?
>  				 &xfs_direct_write_iomap_ops :
>  				 &xfs_read_iomap_ops);
>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
> +		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  	} else {
> -		if (write_fault)
> +		if (write_fault) {
> +			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  			ret = iomap_page_mkwrite(vmf,
>  					&xfs_buffered_write_iomap_ops);
> -		else
> +			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		} else
>  			ret = filemap_fault(vmf);
>  	}
> -	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  
>  	if (write_fault)
>  		sb_end_pagefault(inode->i_sb);
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 0369eb22c1bb..53bb5fc33621 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -131,7 +131,7 @@ xfs_ilock_attr_map_shared(
>  
>  /*
>   * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
> - * multi-reader locks: i_mmap_lock and the i_lock.  This routine allows
> + * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
>   * various combinations of the locks to be obtained.
>   *
>   * The 3 locks should always be ordered so that the IO lock is obtained first,
> @@ -139,23 +139,23 @@ xfs_ilock_attr_map_shared(
>   *
>   * Basic locking order:
>   *
> - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
> + * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
>   *
>   * mmap_lock locking order:
>   *
>   * i_rwsem -> page lock -> mmap_lock
> - * mmap_lock -> i_mmap_lock -> page_lock
> + * mmap_lock -> invalidate_lock -> page_lock
>   *
>   * The difference in mmap_lock locking order mean that we cannot hold the
> - * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
> - * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
> - * in get_user_pages() to map the user pages into the kernel address space for
> - * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
> - * page faults already hold the mmap_lock.
> + * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
> + * can fault in pages during copy in/out (for buffered IO) or require the
> + * mmap_lock in get_user_pages() to map the user pages into the kernel address
> + * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
> + * fault because page faults already hold the mmap_lock.
>   *
>   * Hence to serialise fully against both syscall and mmap based IO, we need to
> - * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
> - * taken in places where we need to invalidate the page cache in a race
> + * take both the i_rwsem and the invalidate_lock. These locks should *only* be
> + * both taken in places where we need to invalidate the page cache in a race
>   * free manner (e.g. truncate, hole punch and other extent manipulation
>   * functions).
>   */
> @@ -187,10 +187,13 @@ xfs_ilock(
>  				 XFS_IOLOCK_DEP(lock_flags));
>  	}
>  
> -	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
> -	else if (lock_flags & XFS_MMAPLOCK_SHARED)
> -		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
> +	if (lock_flags & XFS_MMAPLOCK_EXCL) {
> +		down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
> +				  XFS_MMAPLOCK_DEP(lock_flags));
> +	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
> +		down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
> +				 XFS_MMAPLOCK_DEP(lock_flags));
> +	}
>  
>  	if (lock_flags & XFS_ILOCK_EXCL)
>  		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
> @@ -239,10 +242,10 @@ xfs_ilock_nowait(
>  	}
>  
>  	if (lock_flags & XFS_MMAPLOCK_EXCL) {
> -		if (!mrtryupdate(&ip->i_mmaplock))
> +		if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
>  			goto out_undo_iolock;
>  	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
> -		if (!mrtryaccess(&ip->i_mmaplock))
> +		if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
>  			goto out_undo_iolock;
>  	}
>  
> @@ -257,9 +260,9 @@ xfs_ilock_nowait(
>  
>  out_undo_mmaplock:
>  	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrunlock_excl(&ip->i_mmaplock);
> +		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
>  	else if (lock_flags & XFS_MMAPLOCK_SHARED)
> -		mrunlock_shared(&ip->i_mmaplock);
> +		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
>  out_undo_iolock:
>  	if (lock_flags & XFS_IOLOCK_EXCL)
>  		up_write(&VFS_I(ip)->i_rwsem);
> @@ -306,9 +309,9 @@ xfs_iunlock(
>  		up_read(&VFS_I(ip)->i_rwsem);
>  
>  	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrunlock_excl(&ip->i_mmaplock);
> +		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
>  	else if (lock_flags & XFS_MMAPLOCK_SHARED)
> -		mrunlock_shared(&ip->i_mmaplock);
> +		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
>  
>  	if (lock_flags & XFS_ILOCK_EXCL)
>  		mrunlock_excl(&ip->i_lock);
> @@ -334,7 +337,7 @@ xfs_ilock_demote(
>  	if (lock_flags & XFS_ILOCK_EXCL)
>  		mrdemote(&ip->i_lock);
>  	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrdemote(&ip->i_mmaplock);
> +		downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
>  	if (lock_flags & XFS_IOLOCK_EXCL)
>  		downgrade_write(&VFS_I(ip)->i_rwsem);
>  
> @@ -355,8 +358,11 @@ xfs_isilocked(
>  
>  	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
>  		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
> -			return !!ip->i_mmaplock.mr_writer;
> -		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
> +			return !debug_locks ||
> +				lockdep_is_held_type(
> +					&VFS_I(ip)->i_mapping->invalidate_lock,
> +					0);
> +		return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock);

This doesn't look right...

If lockdep is disabled, we always return true for
xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock?

Granted, you probably just copy-pasted from the IOLOCK_SHARED clause
beneath it.  Er... oh right, preichl was messing with all that...

https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/

I guess I'll go have a look at that again.

--D

>  	}
>  
>  	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index ca826cfba91c..a0e4153efbbe 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -40,7 +40,6 @@ typedef struct xfs_inode {
>  	/* Transaction and locking information. */
>  	struct xfs_inode_log_item *i_itemp;	/* logging information */
>  	mrlock_t		i_lock;		/* inode lock */
> -	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
>  	atomic_t		i_pincount;	/* inode pin count */
>  
>  	/*
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index a2dab05332ac..eeaf44910b5f 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -715,8 +715,6 @@ xfs_fs_inode_init_once(
>  	atomic_set(&ip->i_pincount, 0);
>  	spin_lock_init(&ip->i_flags_lock);
>  
> -	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
> -		     "xfsino", ip->i_ino);
>  	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
>  		     "xfsino", ip->i_ino);
>  }
> -- 
> 2.26.2
>
Dave Chinner May 25, 2021, 9:40 p.m. UTC | #2
On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote:
> Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
> purpose of invalidate_lock is exactly the same. Note that the locking in
> __xfs_filemap_fault() slightly changes as filemap_fault() already takes
> invalidate_lock.
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> CC: <linux-xfs@vger.kernel.org>
> CC: "Darrick J. Wong" <darrick.wong@oracle.com>
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/xfs/xfs_file.c  | 12 ++++++-----
>  fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++--------------------
>  fs/xfs/xfs_inode.h |  1 -
>  fs/xfs/xfs_super.c |  2 --
>  4 files changed, 36 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 396ef36dcd0a..dc9cb5c20549 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1282,7 +1282,7 @@ xfs_file_llseek(
>   *
>   * mmap_lock (MM)
>   *   sb_start_pagefault(vfs, freeze)
> - *     i_mmaplock (XFS - truncate serialisation)
> + *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
>   *       page_lock (MM)
>   *         i_lock (XFS - extent map serialisation)
>   */
> @@ -1303,24 +1303,26 @@ __xfs_filemap_fault(
>  		file_update_time(vmf->vma->vm_file);
>  	}
>  
> -	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  	if (IS_DAX(inode)) {
>  		pfn_t pfn;
>  
> +		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
>  				(write_fault && !vmf->cow_page) ?
>  				 &xfs_direct_write_iomap_ops :
>  				 &xfs_read_iomap_ops);
>  		if (ret & VM_FAULT_NEEDDSYNC)
>  			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
> +		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  	} else {
> -		if (write_fault)
> +		if (write_fault) {
> +			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  			ret = iomap_page_mkwrite(vmf,
>  					&xfs_buffered_write_iomap_ops);
> -		else
> +			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> +		} else
>  			ret = filemap_fault(vmf);
>  	}
> -	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

This seems kinda messy. filemap_fault() basically takes the
invalidate lock around the entire operation it runs, so maybe it
would be cleaner to implement it as:

filemap_fault_locked(vmf)
{
	/* does the filemap fault work */
}

filemap_fault(vmf)
{
	filemap_invalidate_down_read(...)
	ret = filemap_fault_locked(vmf)
	filemap_invalidate_up_read(...)
	return ret;
}

And that means XFS could just call filemap_fault_locked() and not 
have to do all this messy locking just to avoid holding the lock
that filemap_fault has now internalised.

> @@ -355,8 +358,11 @@ xfs_isilocked(
>  
>  	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
>  		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
> -			return !!ip->i_mmaplock.mr_writer;
> -		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
> +			return !debug_locks ||
> +				lockdep_is_held_type(
> +					&VFS_I(ip)->i_mapping->invalidate_lock,
> +					0);
> +		return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock);
>  	}

<sigh>

And so here we are again, losing more of our read vs write debug
checks on debug kernels when lockdep is not enabled....

Can we please add rwsem_is_locked_read() and rwsem_is_locked_write()
wrappers that just look at the rwsem counter value to determine how
the lock is held? Then the mrlock_t can go away entirely....

Cheers,

Dave.
Jan Kara May 26, 2021, 10:18 a.m. UTC | #3
On Tue 25-05-21 14:37:29, Darrick J. Wong wrote:
> On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote:
> > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
> > purpose of invalidate_lock is exactly the same. Note that the locking in
> > __xfs_filemap_fault() slightly changes as filemap_fault() already takes
> > invalidate_lock.
> > 
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > CC: <linux-xfs@vger.kernel.org>
> > CC: "Darrick J. Wong" <darrick.wong@oracle.com>
> 
> It's djwong@kernel.org now.

OK, updated.

> > @@ -355,8 +358,11 @@ xfs_isilocked(
> >  
> >  	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
> >  		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
> > -			return !!ip->i_mmaplock.mr_writer;
> > -		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
> > +			return !debug_locks ||
> > +				lockdep_is_held_type(
> > +					&VFS_I(ip)->i_mapping->invalidate_lock,
> > +					0);
> > +		return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock);
> 
> This doesn't look right...
> 
> If lockdep is disabled, we always return true for
> xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock?
> 
> Granted, you probably just copy-pasted from the IOLOCK_SHARED clause
> beneath it.  Er... oh right, preichl was messing with all that...
> 
> https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/

Indeed copy-paste programming ;) It certainly makes the assertions happy
but useless. Should I pull the patch you reference into the series? It
seems to have been uncontroversial and reviewed. Or will you pull the
series to xfs tree so I can just rebase on top?

								Honza
Jan Kara May 26, 2021, 10:20 a.m. UTC | #4
On Wed 26-05-21 07:40:41, Dave Chinner wrote:
> On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote:
> > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
> > purpose of invalidate_lock is exactly the same. Note that the locking in
> > __xfs_filemap_fault() slightly changes as filemap_fault() already takes
> > invalidate_lock.
> > 
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > CC: <linux-xfs@vger.kernel.org>
> > CC: "Darrick J. Wong" <darrick.wong@oracle.com>
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/xfs/xfs_file.c  | 12 ++++++-----
> >  fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++--------------------
> >  fs/xfs/xfs_inode.h |  1 -
> >  fs/xfs/xfs_super.c |  2 --
> >  4 files changed, 36 insertions(+), 31 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 396ef36dcd0a..dc9cb5c20549 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -1282,7 +1282,7 @@ xfs_file_llseek(
> >   *
> >   * mmap_lock (MM)
> >   *   sb_start_pagefault(vfs, freeze)
> > - *     i_mmaplock (XFS - truncate serialisation)
> > + *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
> >   *       page_lock (MM)
> >   *         i_lock (XFS - extent map serialisation)
> >   */
> > @@ -1303,24 +1303,26 @@ __xfs_filemap_fault(
> >  		file_update_time(vmf->vma->vm_file);
> >  	}
> >  
> > -	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> >  	if (IS_DAX(inode)) {
> >  		pfn_t pfn;
> >  
> > +		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> >  		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
> >  				(write_fault && !vmf->cow_page) ?
> >  				 &xfs_direct_write_iomap_ops :
> >  				 &xfs_read_iomap_ops);
> >  		if (ret & VM_FAULT_NEEDDSYNC)
> >  			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
> > +		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> >  	} else {
> > -		if (write_fault)
> > +		if (write_fault) {
> > +			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> >  			ret = iomap_page_mkwrite(vmf,
> >  					&xfs_buffered_write_iomap_ops);
> > -		else
> > +			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > +		} else
> >  			ret = filemap_fault(vmf);
> >  	}
> > -	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> 
> This seems kinda messy. filemap_fault() basically takes the
> invalidate lock around the entire operation, it runs, so maybe it
> would be cleaner to implement it as:
> 
> filemap_fault_locked(vmf)
> {
> 	/* does the filemap fault work */
> }
> 
> filemap_fault(vmf)
> {
> 	filemap_invalidate_down_read(...)
> 	ret = filemap_fault_locked(vmf)
> 	filemap_invalidate_up_read(...)
> 	return ret;
> }
> 
> And that means XFS could just call filemap_fault_locked() and not 
> have to do all this messy locking just to avoid holding the lock
> that filemap_fault has now internalised.

Sure, I can do that.

> > @@ -355,8 +358,11 @@ xfs_isilocked(
> >  
> >  	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
> >  		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
> > -			return !!ip->i_mmaplock.mr_writer;
> > -		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
> > +			return !debug_locks ||
> > +				lockdep_is_held_type(
> > +					&VFS_I(ip)->i_mapping->invalidate_lock,
> > +					0);
> > +		return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock);
> >  	}
> 
> <sigh>
> 
> And so here we are again, losing more of our read vs write debug
> checks on debug kernels when lockdep is not enabled....
> 
> Can we please add rwsem_is_locked_read() and rwsem_is_locked_write()
> wrappers that just look at the rwsem counter value to determine how
> the lock is held? Then the mrlock_t can go away entirely....

Apparently someone already did that for XFS as Darrick pointed out. So we
just have to sort out how to merge it.

								Honza
Jan Kara May 26, 2021, 1:42 p.m. UTC | #5
On Wed 26-05-21 12:20:59, Jan Kara wrote:
> On Wed 26-05-21 07:40:41, Dave Chinner wrote:
> > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote:
> > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
> > > purpose of invalidate_lock is exactly the same. Note that the locking in
> > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes
> > > invalidate_lock.
> > > 
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > > CC: <linux-xfs@vger.kernel.org>
> > > CC: "Darrick J. Wong" <darrick.wong@oracle.com>
> > > Signed-off-by: Jan Kara <jack@suse.cz>
> > > ---
> > >  fs/xfs/xfs_file.c  | 12 ++++++-----
> > >  fs/xfs/xfs_inode.c | 52 ++++++++++++++++++++++++++--------------------
> > >  fs/xfs/xfs_inode.h |  1 -
> > >  fs/xfs/xfs_super.c |  2 --
> > >  4 files changed, 36 insertions(+), 31 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 396ef36dcd0a..dc9cb5c20549 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -1282,7 +1282,7 @@ xfs_file_llseek(
> > >   *
> > >   * mmap_lock (MM)
> > >   *   sb_start_pagefault(vfs, freeze)
> > > - *     i_mmaplock (XFS - truncate serialisation)
> > > + *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
> > >   *       page_lock (MM)
> > >   *         i_lock (XFS - extent map serialisation)
> > >   */
> > > @@ -1303,24 +1303,26 @@ __xfs_filemap_fault(
> > >  		file_update_time(vmf->vma->vm_file);
> > >  	}
> > >  
> > > -	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > >  	if (IS_DAX(inode)) {
> > >  		pfn_t pfn;
> > >  
> > > +		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > >  		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
> > >  				(write_fault && !vmf->cow_page) ?
> > >  				 &xfs_direct_write_iomap_ops :
> > >  				 &xfs_read_iomap_ops);
> > >  		if (ret & VM_FAULT_NEEDDSYNC)
> > >  			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
> > > +		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > >  	} else {
> > > -		if (write_fault)
> > > +		if (write_fault) {
> > > +			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > >  			ret = iomap_page_mkwrite(vmf,
> > >  					&xfs_buffered_write_iomap_ops);
> > > -		else
> > > +			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > > +		} else
> > >  			ret = filemap_fault(vmf);
> > >  	}
> > > -	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > 
> > This seems kinda messy. filemap_fault() basically takes the
> > invalidate lock around the entire operation, it runs, so maybe it
> > would be cleaner to implement it as:
> > 
> > filemap_fault_locked(vmf)
> > {
> > 	/* does the filemap fault work */
> > }
> > 
> > filemap_fault(vmf)
> > {
> > 	filemap_invalidate_down_read(...)
> > 	ret = filemap_fault_locked(vmf)
> > 	filemap_invalidate_up_read(...)
> > 	return ret;
> > }
> > 
> > And that means XFS could just call filemap_fault_locked() and not 
> > have to do all this messy locking just to avoid holding the lock
> > that filemap_fault has now internalised.
> 
> Sure, I can do that.

Hum, looking into this in more detail, it isn't that easy. There are some
operations inside filemap_fault() that need to be done outside of
invalidate_lock. In particular, we call into the readahead code, which will
grab invalidate_lock for itself. So we'd need to pass, via struct
readahead_control, whether invalidate_lock is already held, which is IMHO
uglier than what we currently do in __xfs_filemap_fault().

								Honza
Darrick J. Wong May 26, 2021, 3:32 p.m. UTC | #6
On Wed, May 26, 2021 at 12:18:40PM +0200, Jan Kara wrote:
> On Tue 25-05-21 14:37:29, Darrick J. Wong wrote:
> > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote:
> > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
> > > purpose of invalidate_lock is exactly the same. Note that the locking in
> > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes
> > > invalidate_lock.
> > > 
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > > CC: <linux-xfs@vger.kernel.org>
> > > CC: "Darrick J. Wong" <darrick.wong@oracle.com>
> > 
> > It's djwong@kernel.org now.
> 
> OK, updated.
> 
> > > @@ -355,8 +358,11 @@ xfs_isilocked(
> > >  
> > >  	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
> > >  		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
> > > -			return !!ip->i_mmaplock.mr_writer;
> > > -		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
> > > +			return !debug_locks ||
> > > +				lockdep_is_held_type(
> > > +					&VFS_I(ip)->i_mapping->invalidate_lock,
> > > +					0);
> > > +		return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock);
> > 
> > This doesn't look right...
> > 
> > If lockdep is disabled, we always return true for
> > xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock?
> > 
> > Granted, you probably just copy-pasted from the IOLOCK_SHARED clause
> > beneath it.  Er... oh right, preichl was messing with all that...
> > 
> > https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/
> 
> Indeed copy-paste programming ;) It certainly makes the assertions happy
> but useless. Should I pull the patch you reference into the series? It
> seems to have been uncontroversial and reviewed. Or will you pull the
> series to xfs tree so I can just rebase on top?

The full conversion series introduced assertion failures because lockdep
can't handle some of the ILOCK usage patterns, specifically the fact
that a thread sometimes takes the ILOCK but then hands the inode to a
workqueue to avoid overflowing the first thread's stack.  That's why it
never got merged into the xfs tree.

However, that kind of switcheroo isn't done with the
MMAPLOCK/invalidate_lock, so you could simply pull the patch I linked
above into your series.

--D

> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Jan Kara May 27, 2021, 12:01 p.m. UTC | #7
On Wed 26-05-21 08:32:51, Darrick J. Wong wrote:
> On Wed, May 26, 2021 at 12:18:40PM +0200, Jan Kara wrote:
> > On Tue 25-05-21 14:37:29, Darrick J. Wong wrote:
> > > On Tue, May 25, 2021 at 03:50:44PM +0200, Jan Kara wrote:
> > > > Use invalidate_lock instead of XFS internal i_mmap_lock. The intended
> > > > purpose of invalidate_lock is exactly the same. Note that the locking in
> > > > __xfs_filemap_fault() slightly changes as filemap_fault() already takes
> > > > invalidate_lock.
> > > > 
> > > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > > > CC: <linux-xfs@vger.kernel.org>
> > > > CC: "Darrick J. Wong" <darrick.wong@oracle.com>
> > > 
> > > It's djwong@kernel.org now.
> > 
> > OK, updated.
> > 
> > > > @@ -355,8 +358,11 @@ xfs_isilocked(
> > > >  
> > > >  	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
> > > >  		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
> > > > -			return !!ip->i_mmaplock.mr_writer;
> > > > -		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
> > > > +			return !debug_locks ||
> > > > +				lockdep_is_held_type(
> > > > +					&VFS_I(ip)->i_mapping->invalidate_lock,
> > > > +					0);
> > > > +		return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock);
> > > 
> > > This doesn't look right...
> > > 
> > > If lockdep is disabled, we always return true for
> > > xfs_isilocked(ip, XFS_MMAPLOCK_EXCL) even if nobody holds the lock?
> > > 
> > > Granted, you probably just copy-pasted from the IOLOCK_SHARED clause
> > > beneath it.  Er... oh right, preichl was messing with all that...
> > > 
> > > https://lore.kernel.org/linux-xfs/20201016021005.548850-2-preichl@redhat.com/
> > 
> > Indeed copy-paste programming ;) It certainly makes the assertions happy
> > but useless. Should I pull the patch you reference into the series? It
> > seems to have been uncontroversial and reviewed. Or will you pull the
> > series to xfs tree so I can just rebase on top?
> 
> The full conversion series introduced assertion failures because lockdep
> can't handle some of the ILOCK usage patterns, specifically the fact
> that a thread sometimes takes the ILOCK but then hands the inode to a
> workqueue to avoid overflowing the first thread's stack.  That's why it
> never got merged into the xfs tree.

I see. Yeah, we do "interesting" dances around lockdep fs-freezing
annotations for AIO as well, where the freeze protection is inherited from
submission to completion context (we effectively generate a false release
event for lockdep when exiting the submit context and a false acquire event
in the completion context). It can be done but it's ugly and error-prone.

> However, that kind of switcheroo isn't done with the
> MMAPLOCK/invalidate_lock, so you could simply pull the patch I linked
> above into your series.

OK, will do!

								Honza
diff mbox series

Patch

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 396ef36dcd0a..dc9cb5c20549 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1282,7 +1282,7 @@  xfs_file_llseek(
  *
  * mmap_lock (MM)
  *   sb_start_pagefault(vfs, freeze)
- *     i_mmaplock (XFS - truncate serialisation)
+ *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
  *       page_lock (MM)
  *         i_lock (XFS - extent map serialisation)
  */
@@ -1303,24 +1303,26 @@  __xfs_filemap_fault(
 		file_update_time(vmf->vma->vm_file);
 	}
 
-	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode)) {
 		pfn_t pfn;
 
+		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
 				(write_fault && !vmf->cow_page) ?
 				 &xfs_direct_write_iomap_ops :
 				 &xfs_read_iomap_ops);
 		if (ret & VM_FAULT_NEEDDSYNC)
 			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	} else {
-		if (write_fault)
+		if (write_fault) {
+			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 			ret = iomap_page_mkwrite(vmf,
 					&xfs_buffered_write_iomap_ops);
-		else
+			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+		} else
 			ret = filemap_fault(vmf);
 	}
-	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (write_fault)
 		sb_end_pagefault(inode->i_sb);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0369eb22c1bb..53bb5fc33621 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -131,7 +131,7 @@  xfs_ilock_attr_map_shared(
 
 /*
  * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
- * multi-reader locks: i_mmap_lock and the i_lock.  This routine allows
+ * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
  * various combinations of the locks to be obtained.
  *
  * The 3 locks should always be ordered so that the IO lock is obtained first,
@@ -139,23 +139,23 @@  xfs_ilock_attr_map_shared(
  *
  * Basic locking order:
  *
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
  *
  * mmap_lock locking order:
  *
  * i_rwsem -> page lock -> mmap_lock
- * mmap_lock -> i_mmap_lock -> page_lock
+ * mmap_lock -> invalidate_lock -> page_lock
  *
  * The difference in mmap_lock locking order mean that we cannot hold the
- * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
- * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_lock.
+ * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
  *
  * Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
- * taken in places where we need to invalidate the page cache in a race
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
  * free manner (e.g. truncate, hole punch and other extent manipulation
  * functions).
  */
@@ -187,10 +187,13 @@  xfs_ilock(
 				 XFS_IOLOCK_DEP(lock_flags));
 	}
 
-	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_MMAPLOCK_SHARED)
-		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+	if (lock_flags & XFS_MMAPLOCK_EXCL) {
+		down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+				  XFS_MMAPLOCK_DEP(lock_flags));
+	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+		down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+				 XFS_MMAPLOCK_DEP(lock_flags));
+	}
 
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
@@ -239,10 +242,10 @@  xfs_ilock_nowait(
 	}
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_mmaplock))
+		if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
 			goto out_undo_iolock;
 	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_mmaplock))
+		if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
 			goto out_undo_iolock;
 	}
 
@@ -257,9 +260,9 @@  xfs_ilock_nowait(
 
 out_undo_mmaplock:
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrunlock_excl(&ip->i_mmaplock);
+		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 	else if (lock_flags & XFS_MMAPLOCK_SHARED)
-		mrunlock_shared(&ip->i_mmaplock);
+		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
 out_undo_iolock:
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		up_write(&VFS_I(ip)->i_rwsem);
@@ -306,9 +309,9 @@  xfs_iunlock(
 		up_read(&VFS_I(ip)->i_rwsem);
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrunlock_excl(&ip->i_mmaplock);
+		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 	else if (lock_flags & XFS_MMAPLOCK_SHARED)
-		mrunlock_shared(&ip->i_mmaplock);
+		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrunlock_excl(&ip->i_lock);
@@ -334,7 +337,7 @@  xfs_ilock_demote(
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrdemote(&ip->i_lock);
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrdemote(&ip->i_mmaplock);
+		downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		downgrade_write(&VFS_I(ip)->i_rwsem);
 
@@ -355,8 +358,11 @@  xfs_isilocked(
 
 	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
 		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
-			return !!ip->i_mmaplock.mr_writer;
-		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+			return !debug_locks ||
+				lockdep_is_held_type(
+					&VFS_I(ip)->i_mapping->invalidate_lock,
+					0);
+		return rwsem_is_locked(&VFS_I(ip)->i_mapping->invalidate_lock);
 	}
 
 	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ca826cfba91c..a0e4153efbbe 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -40,7 +40,6 @@  typedef struct xfs_inode {
 	/* Transaction and locking information. */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
-	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
 
 	/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a2dab05332ac..eeaf44910b5f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -715,8 +715,6 @@  xfs_fs_inode_init_once(
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 
-	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-		     "xfsino", ip->i_ino);
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", ip->i_ino);
 }