diff mbox series

[10/12] fs/dax: Properly refcount fs dax pages

Message ID 9f4ef8eaba4c80230904da893018ce615b5c24b2.1725941415.git-series.apopple@nvidia.com (mailing list archive)
State Handled Elsewhere, archived
Headers show
Series fs/dax: Fix FS DAX page reference counts | expand

Commit Message

Alistair Popple Sept. 10, 2024, 4:14 a.m. UTC
Currently fs dax pages are considered free when the refcount drops to
one and their refcounts are not increased when mapped via PTEs or
decreased when unmapped. This requires special logic in mm paths to
detect that these pages should not be properly refcounted, and to
detect when the refcount drops to one instead of zero.

On the other hand get_user_pages(), etc. will properly refcount fs dax
pages by taking a reference and dropping it when the page is
unpinned.

Tracking this special behaviour requires extra PTE bits
(eg. pte_devmap) and introduces rules that are potentially confusing
and specific to FS DAX pages. To fix this, and to possibly allow
removal of the special PTE bits in future, convert the fs dax page
refcounts to be zero based and instead take a reference on the page
each time it is mapped as is currently the case for normal pages.

This may also allow a future clean-up to remove the pgmap refcounting
that is currently done in mm/gup.c.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
 drivers/dax/device.c       |  12 +-
 drivers/dax/super.c        |   2 +-
 drivers/nvdimm/pmem.c      |   4 +-
 fs/dax.c                   | 192 ++++++++++++++++++--------------------
 fs/fuse/virtio_fs.c        |   3 +-
 include/linux/dax.h        |   6 +-
 include/linux/mm.h         |  27 +-----
 include/linux/page-flags.h |   6 +-
 mm/gup.c                   |   9 +--
 mm/huge_memory.c           |   6 +-
 mm/internal.h              |   2 +-
 mm/memory-failure.c        |   6 +-
 mm/memory.c                |   6 +-
 mm/memremap.c              |  40 +++-----
 mm/mlock.c                 |   2 +-
 mm/mm_init.c               |   9 +--
 mm/swap.c                  |   2 +-
 17 files changed, 143 insertions(+), 191 deletions(-)

Comments

Dan Williams Sept. 27, 2024, 7:59 a.m. UTC | #1
Alistair Popple wrote:
> Currently fs dax pages are considered free when the refcount drops to
> one and their refcounts are not increased when mapped via PTEs or
> decreased when unmapped. This requires special logic in mm paths to
> detect that these pages should not be properly refcounted, and to
> detect when the refcount drops to one instead of zero.
> 
> On the other hand get_user_pages(), etc. will properly refcount fs dax
> pages by taking a reference and dropping it when the page is
> unpinned.
> 
> Tracking this special behaviour requires extra PTE bits
> (eg. pte_devmap) and introduces rules that are potentially confusing
> and specific to FS DAX pages. To fix this, and to possibly allow
> removal of the special PTE bits in future, convert the fs dax page
> refcounts to be zero based and instead take a reference on the page
> each time it is mapped as is currently the case for normal pages.
> 
> This may also allow a future clean-up to remove the pgmap refcounting
> that is currently done in mm/gup.c.
> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> ---
>  drivers/dax/device.c       |  12 +-
>  drivers/dax/super.c        |   2 +-
>  drivers/nvdimm/pmem.c      |   4 +-
>  fs/dax.c                   | 192 ++++++++++++++++++--------------------
>  fs/fuse/virtio_fs.c        |   3 +-
>  include/linux/dax.h        |   6 +-
>  include/linux/mm.h         |  27 +-----
>  include/linux/page-flags.h |   6 +-
>  mm/gup.c                   |   9 +--
>  mm/huge_memory.c           |   6 +-
>  mm/internal.h              |   2 +-
>  mm/memory-failure.c        |   6 +-
>  mm/memory.c                |   6 +-
>  mm/memremap.c              |  40 +++-----
>  mm/mlock.c                 |   2 +-
>  mm/mm_init.c               |   9 +--
>  mm/swap.c                  |   2 +-
>  17 files changed, 143 insertions(+), 191 deletions(-)
> 
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index 9c1a729..4d3ddd1 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -126,11 +126,11 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
>  		return VM_FAULT_SIGBUS;
>  	}
>  
> -	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
> +	pfn = phys_to_pfn_t(phys, 0);

BTW, this is part of what prompted me to do the pfn_t cleanup [1] that I
will rebase on top of your series:

[1]: http://lore.kernel.org/66f34a9caeb97_2a7f294fa@dwillia2-xfh.jf.intel.com.notmuch

[..]
> @@ -318,85 +323,58 @@ static unsigned long dax_end_pfn(void *entry)
>   */
>  #define for_each_mapped_pfn(entry, pfn) \
>  	for (pfn = dax_to_pfn(entry); \
> -			pfn < dax_end_pfn(entry); pfn++)
> +		pfn < dax_end_pfn(entry); pfn++)
>  
> -static inline bool dax_page_is_shared(struct page *page)
> +static void dax_device_folio_init(struct folio *folio, int order)
>  {
> -	return page->mapping == PAGE_MAPPING_DAX_SHARED;
> -}
> +	int orig_order = folio_order(folio);
> +	int i;
>  
> -/*
> - * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
> - * refcount.
> - */
> -static inline void dax_page_share_get(struct page *page)
> -{
> -	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
> -		/*
> -		 * Reset the index if the page was already mapped
> -		 * regularly before.
> -		 */
> -		if (page->mapping)
> -			page->share = 1;
> -		page->mapping = PAGE_MAPPING_DAX_SHARED;
> -	}
> -	page->share++;
> -}
> +	if (orig_order != order) {
> +		struct dev_pagemap *pgmap = page_dev_pagemap(&folio->page);

Was there a discussion I missed about why the conversion to typical
folios allows the page->share accounting to be dropped?

I assume this is because the page->mapping validation was dropped, which
I think might be useful to keep at least for one development cycle to
make sure this conversion is not triggering any of the old warnings.

Otherwise, the ->share field of 'struct page' can also be cleaned up.

> -static inline unsigned long dax_page_share_put(struct page *page)
> -{
> -	return --page->share;
> -}
> +		for (i = 0; i < (1UL << orig_order); i++) {
> +			struct page *page = folio_page(folio, i);
>  
> -/*
> - * When it is called in dax_insert_entry(), the shared flag will indicate that
> - * whether this entry is shared by multiple files.  If so, set the page->mapping
> - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
> - */
> -static void dax_associate_entry(void *entry, struct address_space *mapping,
> -		struct vm_area_struct *vma, unsigned long address, bool shared)
> -{
> -	unsigned long size = dax_entry_size(entry), pfn, index;
> -	int i = 0;
> +			ClearPageHead(page);
> +			clear_compound_head(page);
>  
> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
> -		return;
> -
> -	index = linear_page_index(vma, address & ~(size - 1));
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> +			/*
> +			 * Reset pgmap which was over-written by
> +			 * prep_compound_page().
> +			 */
> +			page_folio(page)->pgmap = pgmap;
>  
> -		if (shared) {
> -			dax_page_share_get(page);
> -		} else {
> -			WARN_ON_ONCE(page->mapping);
> -			page->mapping = mapping;
> -			page->index = index + i++;
> +			/* Make sure this isn't set to TAIL_MAPPING */
> +			page->mapping = NULL;
>  		}
>  	}
> +
> +	if (order > 0) {
> +		prep_compound_page(&folio->page, order);
> +		if (order > 1)
> +			INIT_LIST_HEAD(&folio->_deferred_list);
> +	}
>  }
>  
> -static void dax_disassociate_entry(void *entry, struct address_space *mapping,
> -		bool trunc)
> +static void dax_associate_new_entry(void *entry, struct address_space *mapping,
> +				pgoff_t index)

Lets call this dax_create_folio(), to mirror filemap_create_folio() and
have it transition the folio refcount from 0 to 1 to indicate that it is
allocated.

While I am not sure anything requires that, it seems odd that page cache
pages have an elevated refcount at map time and dax pages do not.

It does have implications for the dax dma-idle tracking though, see
below.

>  {
> -	unsigned long pfn;
> +	unsigned long order = dax_entry_order(entry);
> +	struct folio *folio = dax_to_folio(entry);
>  
> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
> +	if (!dax_entry_size(entry))
>  		return;
>  
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> -		if (dax_page_is_shared(page)) {
> -			/* keep the shared flag if this page is still shared */
> -			if (dax_page_share_put(page) > 0)
> -				continue;
> -		} else
> -			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
> -		page->mapping = NULL;
> -		page->index = 0;
> -	}
> +	/*
> +	 * We don't hold a reference for the DAX pagecache entry for the
> +	 * page. But we need to initialise the folio so we can hand it
> +	 * out. Nothing else should have a reference either.
> +	 */
> +	WARN_ON_ONCE(folio_ref_count(folio));

Per above I would feel more comfortable if we kept the paranoia around
to ensure that all the pages in this folio have dropped all references
and cleared ->mapping and ->index.

That paranoia can be placed behind a CONFIG_DEBUG_VM check, and we can
delete it in a follow-on development cycle, but in the meantime it helps to
prove the correctness of the conversion.

[..]
> @@ -1189,11 +1165,14 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
>  	struct inode *inode = iter->inode;
>  	unsigned long vaddr = vmf->address;
>  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
> +	struct page *page = pfn_t_to_page(pfn);
>  	vm_fault_t ret;
>  
>  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
>  
> -	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
> +	page_ref_inc(page);
> +	ret = dax_insert_pfn(vmf, pfn, false);
> +	put_page(page);

Per above I think it is problematic to have pages live in the system
without a refcount.

One scenario where this might be needed is invalidate_inode_pages() vs
DMA. The invalidation should pause and wait for DMA pins to be dropped
before the mapping xarray is cleaned up and the dax folio is marked
free.

I think this may be a gap in the current code. I'll attempt to write a
test for this to check.

[..]
> @@ -1649,9 +1627,10 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>  	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
>  	bool write = iter->flags & IOMAP_WRITE;
>  	unsigned long entry_flags = pmd ? DAX_PMD : 0;
> -	int err = 0;
> +	int ret, err = 0;
>  	pfn_t pfn;
>  	void *kaddr;
> +	struct page *page;
>  
>  	if (!pmd && vmf->cow_page)
>  		return dax_fault_cow_page(vmf, iter);
> @@ -1684,14 +1663,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>  	if (dax_fault_is_synchronous(iter, vmf->vma))
>  		return dax_fault_synchronous_pfnp(pfnp, pfn);
>  
> -	/* insert PMD pfn */
> +	page = pfn_t_to_page(pfn);

I think this is clearer if dax_insert_entry() returns folios with an
elevated reference count that is dropped when the folio is invalidated
out of the mapping.

[..]
> @@ -519,21 +529,3 @@ void zone_device_page_init(struct page *page)
>  	lock_page(page);
>  }
>  EXPORT_SYMBOL_GPL(zone_device_page_init);
> -
> -#ifdef CONFIG_FS_DAX
> -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
> -{
> -	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
> -		return false;
> -
> -	/*
> -	 * fsdax page refcounts are 1-based, rather than 0-based: if
> -	 * refcount is 1, then the page is free and the refcount is
> -	 * stable because nobody holds a reference on the page.
> -	 */
> -	if (folio_ref_sub_return(folio, refs) == 1)
> -		wake_up_var(&folio->_refcount);
> -	return true;

It follows from the refcount discussion above that I think there is an
argument to still keep this wakeup based on the 2->1 transition.
pagecache pages are refcount==1 when they are dma-idle but still
allocated. To keep the same semantics for dax a dax_folio would have an
elevated refcount whenever it is referenced by a mapping entry.
Alistair Popple Oct. 24, 2024, 7:52 a.m. UTC | #2
Dan Williams <dan.j.williams@intel.com> writes:

> Alistair Popple wrote:

[...]

>> @@ -318,85 +323,58 @@ static unsigned long dax_end_pfn(void *entry)
>>   */
>>  #define for_each_mapped_pfn(entry, pfn) \
>>  	for (pfn = dax_to_pfn(entry); \
>> -			pfn < dax_end_pfn(entry); pfn++)
>> +		pfn < dax_end_pfn(entry); pfn++)
>>  
>> -static inline bool dax_page_is_shared(struct page *page)
>> +static void dax_device_folio_init(struct folio *folio, int order)
>>  {
>> -	return page->mapping == PAGE_MAPPING_DAX_SHARED;
>> -}
>> +	int orig_order = folio_order(folio);
>> +	int i;
>>  
>> -/*
>> - * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
>> - * refcount.
>> - */
>> -static inline void dax_page_share_get(struct page *page)
>> -{
>> -	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
>> -		/*
>> -		 * Reset the index if the page was already mapped
>> -		 * regularly before.
>> -		 */
>> -		if (page->mapping)
>> -			page->share = 1;
>> -		page->mapping = PAGE_MAPPING_DAX_SHARED;
>> -	}
>> -	page->share++;
>> -}
>> +	if (orig_order != order) {
>> +		struct dev_pagemap *pgmap = page_dev_pagemap(&folio->page);
>
> Was there a discussion I missed about why the conversion to typical
> folios allows the page->share accounting to be dropped.

The problem with keeping it is we now treat DAX pages as "normal"
pages according to vm_normal_page(). As such we use the normal paths
for unmapping pages.

Specifically page->share accounting relies on PAGE_MAPPING_DAX_SHARED
aka PAGE_MAPPING_ANON which causes folio_test_anon(), PageAnon(),
etc. to return true leading to all sorts of issues in at least the
unmap paths.

There hasn't been a previous discussion on this, but given this is
only used to print warnings it seemed easier to get rid of it. I
probably should have called that out more clearly in the commit
message though.

> I assume this is because the page->mapping validation was dropped, which
> I think might be useful to keep at least for one development cycle to
> make sure this conversion is not triggering any of the old warnings.
>
> Otherwise, the ->share field of 'struct page' can also be cleaned up.

Yes, we should also clean up the ->share field, unless you have an
alternate suggestion to solve the above issue.

>> -static inline unsigned long dax_page_share_put(struct page *page)
>> -{
>> -	return --page->share;
>> -}
>> +		for (i = 0; i < (1UL << orig_order); i++) {
>> +			struct page *page = folio_page(folio, i);
>>  
>> -/*
>> - * When it is called in dax_insert_entry(), the shared flag will indicate that
>> - * whether this entry is shared by multiple files.  If so, set the page->mapping
>> - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
>> - */
>> -static void dax_associate_entry(void *entry, struct address_space *mapping,
>> -		struct vm_area_struct *vma, unsigned long address, bool shared)
>> -{
>> -	unsigned long size = dax_entry_size(entry), pfn, index;
>> -	int i = 0;
>> +			ClearPageHead(page);
>> +			clear_compound_head(page);
>>  
>> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>> -		return;
>> -
>> -	index = linear_page_index(vma, address & ~(size - 1));
>> -	for_each_mapped_pfn(entry, pfn) {
>> -		struct page *page = pfn_to_page(pfn);
>> +			/*
>> +			 * Reset pgmap which was over-written by
>> +			 * prep_compound_page().
>> +			 */
>> +			page_folio(page)->pgmap = pgmap;
>>  
>> -		if (shared) {
>> -			dax_page_share_get(page);
>> -		} else {
>> -			WARN_ON_ONCE(page->mapping);
>> -			page->mapping = mapping;
>> -			page->index = index + i++;
>> +			/* Make sure this isn't set to TAIL_MAPPING */
>> +			page->mapping = NULL;
>>  		}
>>  	}
>> +
>> +	if (order > 0) {
>> +		prep_compound_page(&folio->page, order);
>> +		if (order > 1)
>> +			INIT_LIST_HEAD(&folio->_deferred_list);
>> +	}
>>  }
>>  
>> -static void dax_disassociate_entry(void *entry, struct address_space *mapping,
>> -		bool trunc)
>> +static void dax_associate_new_entry(void *entry, struct address_space *mapping,
>> +				pgoff_t index)
>
> Lets call this dax_create_folio(), to mirror filemap_create_folio() and
> have it transition the folio refcount from 0 to 1 to indicate that it is
> allocated.
>
> While I am not sure anything requires that, it seems odd that page cache
> pages have an elevated refcount at map time and dax pages do not.

The refcount gets elevated further up the call stack, but I agree it
would be clearer to move it here.

> It does have implications for the dax dma-idle tracking thought, see
> below.
>
>>  {
>> -	unsigned long pfn;
>> +	unsigned long order = dax_entry_order(entry);
>> +	struct folio *folio = dax_to_folio(entry);
>>  
>> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>> +	if (!dax_entry_size(entry))
>>  		return;
>>  
>> -	for_each_mapped_pfn(entry, pfn) {
>> -		struct page *page = pfn_to_page(pfn);
>> -
>> -		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
>> -		if (dax_page_is_shared(page)) {
>> -			/* keep the shared flag if this page is still shared */
>> -			if (dax_page_share_put(page) > 0)
>> -				continue;
>> -		} else
>> -			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
>> -		page->mapping = NULL;
>> -		page->index = 0;
>> -	}
>> +	/*
>> +	 * We don't hold a reference for the DAX pagecache entry for the
>> +	 * page. But we need to initialise the folio so we can hand it
>> +	 * out. Nothing else should have a reference either.
>> +	 */
>> +	WARN_ON_ONCE(folio_ref_count(folio));
>
> Per above I would feel more comfortable if we kept the paranoia around
> to ensure that all the pages in this folio have dropped all references
> and cleared ->mapping and ->index.
>
> That paranoia can be placed behind a CONFIG_DEBUG_VM check, and we can
> delete it in a follow-on development cycle, but in the meantime it helps to
> prove the correctness of the conversion.

I'm ok with paranoia, but as noted above the issue is that at a minimum
page->mapping (and probably index) now needs to be valid for any code
that might walk the page tables.

> [..]
>> @@ -1189,11 +1165,14 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
>>  	struct inode *inode = iter->inode;
>>  	unsigned long vaddr = vmf->address;
>>  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
>> +	struct page *page = pfn_t_to_page(pfn);
>>  	vm_fault_t ret;
>>  
>>  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
>>  
>> -	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
>> +	page_ref_inc(page);
>> +	ret = dax_insert_pfn(vmf, pfn, false);
>> +	put_page(page);
>
> Per above I think it is problematic to have pages live in the system
> without a refcount.

I'm a bit confused by this - the pages have a reference taken on them
when they are mapped. They only live in the system without a refcount
when the mm considers them free (except for the bit between getting
created in dax_associate_entry() and actually getting mapped but as
noted I will fix that).

> One scenario where this might be needed is invalidate_inode_pages() vs
> DMA. The invaldation should pause and wait for DMA pins to be dropped
> before the mapping xarray is cleaned up and the dax folio is marked
> free.

I'm not really following this scenario, or at least how it relates to
the comment above. If the page is pinned for DMA it will have taken a
refcount on it and so the page won't be considered free/idle per
dax_wait_page_idle() or any of the other mm code.

> I think this may be a gap in the current code. I'll attempt to write a
> test for this to check.

Ok, let me know if you come up with anything there as it might help
explain the problem more clearly.

> [..]
>> @@ -1649,9 +1627,10 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>>  	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
>>  	bool write = iter->flags & IOMAP_WRITE;
>>  	unsigned long entry_flags = pmd ? DAX_PMD : 0;
>> -	int err = 0;
>> +	int ret, err = 0;
>>  	pfn_t pfn;
>>  	void *kaddr;
>> +	struct page *page;
>>  
>>  	if (!pmd && vmf->cow_page)
>>  		return dax_fault_cow_page(vmf, iter);
>> @@ -1684,14 +1663,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>>  	if (dax_fault_is_synchronous(iter, vmf->vma))
>>  		return dax_fault_synchronous_pfnp(pfnp, pfn);
>>  
>> -	/* insert PMD pfn */
>> +	page = pfn_t_to_page(pfn);
>
> I think this is clearer if dax_insert_entry() returns folios with an
> elevated refrence count that is dropped when the folio is invalidated
> out of the mapping.

I presume this comment is for the next line:

+	page_ref_inc(page);
 
I can move that into dax_insert_entry(), but we would still need to
drop it after calling vmf_insert_*() to ensure we get the 1 -> 0
transition when the page is unmapped and therefore
freed. Alternatively we can make it so vmf_insert_*() don't take
references on the page, and instead ownership of the reference is
transferred to the mapping. Personally I preferred having those
functions take their own reference but let me know what you think.

> [..]
>> @@ -519,21 +529,3 @@ void zone_device_page_init(struct page *page)
>>  	lock_page(page);
>>  }
>>  EXPORT_SYMBOL_GPL(zone_device_page_init);
>> -
>> -#ifdef CONFIG_FS_DAX
>> -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
>> -{
>> -	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
>> -		return false;
>> -
>> -	/*
>> -	 * fsdax page refcounts are 1-based, rather than 0-based: if
>> -	 * refcount is 1, then the page is free and the refcount is
>> -	 * stable because nobody holds a reference on the page.
>> -	 */
>> -	if (folio_ref_sub_return(folio, refs) == 1)
>> -		wake_up_var(&folio->_refcount);
>> -	return true;
>
> It follow from the refcount disvussion above that I think there is an
> argument to still keep this wakeup based on the 2->1 transitition.
> pagecache pages are refcount==1 when they are dma-idle but still
> allocated. To keep the same semantics for dax a dax_folio would have an
> elevated refcount whenever it is referenced by mapping entry.

I'm not sold on keeping it as it doesn't seem to offer any benefit
IMHO. I know both Jason and Christoph were keen to see it go so it would
be good to get their feedback too. Also one of the primary goals of this
series was to refcount the page normally so we could remove the whole
"page is free with a refcount of 1" semantics.

  - Alistair
Dan Williams Oct. 24, 2024, 11:52 p.m. UTC | #3
Alistair Popple wrote:
[..]
> >
> > Was there a discussion I missed about why the conversion to typical
> > folios allows the page->share accounting to be dropped.
> 
> The problem with keeping it is we now treat DAX pages as "normal"
> pages according to vm_normal_page(). As such we use the normal paths
> for unmapping pages.
> 
> Specifically page->share accounting relies on PAGE_MAPPING_DAX_SHARED
> aka PAGE_MAPPING_ANON which causes folio_test_anon(), PageAnon(),
> etc. to return true leading to all sorts of issues in at least the
> unmap paths.

Oh, I missed that PAGE_MAPPING_DAX_SHARED aliases with
PAGE_MAPPING_ANON.

> There hasn't been a previous discussion on this, but given this is
> only used to print warnings it seemed easier to get rid of it. I
> probably should have called that out more clearly in the commit
> message though.
> 
> > I assume this is because the page->mapping validation was dropped, which
> > I think might be useful to keep at least for one development cycle to
> > make sure this conversion is not triggering any of the old warnings.
> >
> > Otherwise, the ->share field of 'struct page' can also be cleaned up.
> 
> Yes, we should also clean up the ->share field, unless you have an
> alternate suggestion to solve the above issue.

kmalloc minimum alignment is 8, so there is room to do this?

---
diff --git a/fs/dax.c b/fs/dax.c
index c62acd2812f8..a70f081c32cb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -322,7 +322,7 @@ static unsigned long dax_end_pfn(void *entry)
 
 static inline bool dax_page_is_shared(struct page *page)
 {
-	return page->mapping == PAGE_MAPPING_DAX_SHARED;
+	return folio_test_dax_shared(page_folio(page));
 }
 
 /*
@@ -331,14 +331,14 @@ static inline bool dax_page_is_shared(struct page *page)
  */
 static inline void dax_page_share_get(struct page *page)
 {
-	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
+	if (!dax_page_is_shared(page)) {
 		/*
 		 * Reset the index if the page was already mapped
 		 * regularly before.
 		 */
 		if (page->mapping)
 			page->share = 1;
-		page->mapping = PAGE_MAPPING_DAX_SHARED;
+		page->mapping = (void *)PAGE_MAPPING_DAX_SHARED;
 	}
 	page->share++;
 }
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b3a76710487..21b355999ce0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -666,13 +666,14 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
 #define PAGE_MAPPING_ANON	0x1
 #define PAGE_MAPPING_MOVABLE	0x2
 #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
-#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
+/* to be removed once typical page refcounting for dax proves stable */
+#define PAGE_MAPPING_DAX_SHARED	0x4
+#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE | PAGE_MAPPING_DAX_SHARED)
 
 /*
  * Different with flags above, this flag is used only for fsdax mode.  It
  * indicates that this page->mapping is now under reflink case.
  */
-#define PAGE_MAPPING_DAX_SHARED	((void *)0x1)
 
 static __always_inline bool folio_mapping_flags(const struct folio *folio)
 {
@@ -689,6 +690,11 @@ static __always_inline bool folio_test_anon(const struct folio *folio)
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
 }
 
+static __always_inline bool folio_test_dax_shared(const struct folio *folio)
+{
+	return ((unsigned long)folio->mapping & PAGE_MAPPING_DAX_SHARED) != 0;
+}
+
 static __always_inline bool PageAnon(const struct page *page)
 {
 	return folio_test_anon(page_folio(page));
---

...and keep the validation around at least for one post conversion
development cycle?

> > It does have implications for the dax dma-idle tracking thought, see
> > below.
> >
> >>  {
> >> -	unsigned long pfn;
> >> +	unsigned long order = dax_entry_order(entry);
> >> +	struct folio *folio = dax_to_folio(entry);
> >>  
> >> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
> >> +	if (!dax_entry_size(entry))
> >>  		return;
> >>  
> >> -	for_each_mapped_pfn(entry, pfn) {
> >> -		struct page *page = pfn_to_page(pfn);
> >> -
> >> -		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> >> -		if (dax_page_is_shared(page)) {
> >> -			/* keep the shared flag if this page is still shared */
> >> -			if (dax_page_share_put(page) > 0)
> >> -				continue;
> >> -		} else
> >> -			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
> >> -		page->mapping = NULL;
> >> -		page->index = 0;
> >> -	}
> >> +	/*
> >> +	 * We don't hold a reference for the DAX pagecache entry for the
> >> +	 * page. But we need to initialise the folio so we can hand it
> >> +	 * out. Nothing else should have a reference either.
> >> +	 */
> >> +	WARN_ON_ONCE(folio_ref_count(folio));
> >
> > Per above I would feel more comfortable if we kept the paranoia around
> > to ensure that all the pages in this folio have dropped all references
> > and cleared ->mapping and ->index.
> >
> > That paranoia can be placed behind a CONFIG_DEBUB_VM check, and we can
> > delete in a follow-on development cycle, but in the meantime it helps to
> > prove the correctness of the conversion.
> 
> I'm ok with paranoia, but as noted above the issue is that at a minimum
> page->mapping (and probably index) now needs to be valid for any code
> that might walk the page tables.

A quick look seems to say the confusion is limited to aliasing
PAGE_MAPPING_ANON.

> > [..]
> >> @@ -1189,11 +1165,14 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> >>  	struct inode *inode = iter->inode;
> >>  	unsigned long vaddr = vmf->address;
> >>  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
> >> +	struct page *page = pfn_t_to_page(pfn);
> >>  	vm_fault_t ret;
> >>  
> >>  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
> >>  
> >> -	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
> >> +	page_ref_inc(page);
> >> +	ret = dax_insert_pfn(vmf, pfn, false);
> >> +	put_page(page);
> >
> > Per above I think it is problematic to have pages live in the system
> > without a refcount.
> 
> I'm a bit confused by this - the pages have a reference taken on them
> when they are mapped. They only live in the system without a refcount
> when the mm considers them free (except for the bit between getting
> created in dax_associate_entry() and actually getting mapped but as
> noted I will fix that).
> 
> > One scenario where this might be needed is invalidate_inode_pages() vs
> > DMA. The invaldation should pause and wait for DMA pins to be dropped
> > before the mapping xarray is cleaned up and the dax folio is marked
> > free.
> 
> I'm not really following this scenario, or at least how it relates to
> the comment above. If the page is pinned for DMA it will have taken a
> refcount on it and so the page won't be considered free/idle per
> dax_wait_page_idle() or any of the other mm code.

[ tl;dr: I think we're ok, analysis below, but I did talk myself into
the proposed dax_busy_page() changes indeed being broken and needing to
remain checking for refcount > 1, not > 0 ]

It's not the mm code I am worried about. It's the filesystem block
allocator staying in-sync with the allocation state of the page.

fs/dax.c is charged with converting idle storage blocks to pfns to
mapped folios. Once they are mapped, DMA can pin the folio, but nothing
in fs/dax.c pins the mapping. In the pagecache case the page reference
is sufficient to keep the DMA-busy page from being reused. In the dax
case something needs to arrange for DMA to be idle before
dax_delete_mapping_entry().

However, looking at XFS it indeed makes that guarantee. First it does
xfs_break_dax_layouts() then it does truncate_inode_pages() =>
dax_delete_mapping_entry().

It follows that the DMA-idle condition still needs to look for the
case where the refcount is > 1 rather than 0 since refcount == 1 is the
page-mapped-but-DMA-idle condition.

[..]
> >> @@ -1649,9 +1627,10 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
> >>  	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
> >>  	bool write = iter->flags & IOMAP_WRITE;
> >>  	unsigned long entry_flags = pmd ? DAX_PMD : 0;
> >> -	int err = 0;
> >> +	int ret, err = 0;
> >>  	pfn_t pfn;
> >>  	void *kaddr;
> >> +	struct page *page;
> >>  
> >>  	if (!pmd && vmf->cow_page)
> >>  		return dax_fault_cow_page(vmf, iter);
> >> @@ -1684,14 +1663,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
> >>  	if (dax_fault_is_synchronous(iter, vmf->vma))
> >>  		return dax_fault_synchronous_pfnp(pfnp, pfn);
> >>  
> >> -	/* insert PMD pfn */
> >> +	page = pfn_t_to_page(pfn);
> >
> > I think this is clearer if dax_insert_entry() returns folios with an
> > elevated refrence count that is dropped when the folio is invalidated
> > out of the mapping.
> 
> I presume this comment is for the next line:
> 
> +	page_ref_inc(page);
>  
> I can move that into dax_insert_entry(), but we would still need to
> drop it after calling vmf_insert_*() to ensure we get the 1 -> 0
> transition when the page is unmapped and therefore
> freed. Alternatively we can make it so vmf_insert_*() don't take
> references on the page, and instead ownership of the reference is
> transfered to the mapping. Personally I prefered having those
> functions take their own reference but let me know what you think.

Oh, the model I was thinking was that until vmf_insert_XXX() succeeds
then the page was never allocated because it was never mapped. What
happens with the code as proposed is that put_page() triggers page-free
semantics on vmf_insert_XXX() failures, right?

There is no need to invoke the page-free / final-put path on
vmf_insert_XXX() error because the storage-block / pfn never actually
transitioned into a page / folio.

> > [..]
> >> @@ -519,21 +529,3 @@ void zone_device_page_init(struct page *page)
> >>  	lock_page(page);
> >>  }
> >>  EXPORT_SYMBOL_GPL(zone_device_page_init);
> >> -
> >> -#ifdef CONFIG_FS_DAX
> >> -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
> >> -{
> >> -	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
> >> -		return false;
> >> -
> >> -	/*
> >> -	 * fsdax page refcounts are 1-based, rather than 0-based: if
> >> -	 * refcount is 1, then the page is free and the refcount is
> >> -	 * stable because nobody holds a reference on the page.
> >> -	 */
> >> -	if (folio_ref_sub_return(folio, refs) == 1)
> >> -		wake_up_var(&folio->_refcount);
> >> -	return true;
> >
> > It follow from the refcount disvussion above that I think there is an
> > argument to still keep this wakeup based on the 2->1 transitition.
> > pagecache pages are refcount==1 when they are dma-idle but still
> > allocated. To keep the same semantics for dax a dax_folio would have an
> > elevated refcount whenever it is referenced by mapping entry.
> 
> I'm not sold on keeping it as it doesn't seem to offer any benefit
> IMHO. I know both Jason and Christoph were keen to see it go so it be
> good to get their feedback too. Also one of the primary goals of this
> series was to refcount the page normally so we could remove the whole
> "page is free with a refcount of 1" semantics.

The page is still free at refcount 0, no argument there. But, by
introducing a new "page refcount is elevated while mapped" (as it
should), it follows that "page is DMA idle at refcount == 1", right?
Otherwise, the current assumption that filesystems can have
dax_layout_busy_page_range() poll on the state of the pfn in the mapping
is broken because page refcount == 0 also means no page to mapping
association.
Alistair Popple Oct. 25, 2024, 2:46 a.m. UTC | #4
Dan Williams <dan.j.williams@intel.com> writes:

> Alistair Popple wrote:
> [..]
>> >
>> > Was there a discussion I missed about why the conversion to typical
>> > folios allows the page->share accounting to be dropped.
>> 
>> The problem with keeping it is we now treat DAX pages as "normal"
>> pages according to vm_normal_page(). As such we use the normal paths
>> for unmapping pages.
>> 
>> Specifically page->share accounting relies on PAGE_MAPPING_DAX_SHARED
>> aka PAGE_MAPPING_ANON which causes folio_test_anon(), PageAnon(),
>> etc. to return true leading to all sorts of issues in at least the
>> unmap paths.
>
> Oh, I missed that PAGE_MAPPING_DAX_SHARED aliases with
> PAGE_MAPPING_ANON.
>
>> There hasn't been a previous discussion on this, but given this is
>> only used to print warnings it seemed easier to get rid of it. I
>> probably should have called that out more clearly in the commit
>> message though.
>> 
>> > I assume this is because the page->mapping validation was dropped, which
>> > I think might be useful to keep at least for one development cycle to
>> > make sure this conversion is not triggering any of the old warnings.
>> >
>> > Otherwise, the ->share field of 'struct page' can also be cleaned up.
>> 
>> Yes, we should also clean up the ->share field, unless you have an
>> alternate suggestion to solve the above issue.
>
> kmalloc mininimum alignment is 8, so there is room to do this?

Oh right, given the aliasing I had assumed there wasn't room.

> ---
> diff --git a/fs/dax.c b/fs/dax.c
> index c62acd2812f8..a70f081c32cb 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -322,7 +322,7 @@ static unsigned long dax_end_pfn(void *entry)
>  
>  static inline bool dax_page_is_shared(struct page *page)
>  {
> -	return page->mapping == PAGE_MAPPING_DAX_SHARED;
> +	return folio_test_dax_shared(page_folio(page));
>  }
>  
>  /*
> @@ -331,14 +331,14 @@ static inline bool dax_page_is_shared(struct page *page)
>   */
>  static inline void dax_page_share_get(struct page *page)
>  {
> -	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
> +	if (!dax_page_is_shared(page)) {
>  		/*
>  		 * Reset the index if the page was already mapped
>  		 * regularly before.
>  		 */
>  		if (page->mapping)
>  			page->share = 1;
> -		page->mapping = PAGE_MAPPING_DAX_SHARED;
> +		page->mapping = (void *)PAGE_MAPPING_DAX_SHARED;
>  	}
>  	page->share++;
>  }
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 1b3a76710487..21b355999ce0 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -666,13 +666,14 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
>  #define PAGE_MAPPING_ANON	0x1
>  #define PAGE_MAPPING_MOVABLE	0x2
>  #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
> -#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
> +/* to be removed once typical page refcounting for dax proves stable */
> +#define PAGE_MAPPING_DAX_SHARED	0x4
> +#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE | PAGE_MAPPING_DAX_SHARED)
>  
>  /*
>   * Different with flags above, this flag is used only for fsdax mode.  It
>   * indicates that this page->mapping is now under reflink case.
>   */
> -#define PAGE_MAPPING_DAX_SHARED	((void *)0x1)
>  
>  static __always_inline bool folio_mapping_flags(const struct folio *folio)
>  {
> @@ -689,6 +690,11 @@ static __always_inline bool folio_test_anon(const struct folio *folio)
>  	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
>  }
>  
> +static __always_inline bool folio_test_dax_shared(const struct folio *folio)
> +{
> +	return ((unsigned long)folio->mapping & PAGE_MAPPING_DAX_SHARED) != 0;
> +}
> +
>  static __always_inline bool PageAnon(const struct page *page)
>  {
>  	return folio_test_anon(page_folio(page));
> ---
>
> ...and keep the validation around at least for one post conversion
> development cycle?

Looks reasonable, will add that back for at least a development
cycle. In reality it will probably stay forever and I will add a comment
to the PAGE_MAPPING_DAX_SHARED definition saying it can be easily
removed if more flags are needed.

>> > It does have implications for the dax dma-idle tracking thought, see
>> > below.
>> >
>> >>  {
>> >> -	unsigned long pfn;
>> >> +	unsigned long order = dax_entry_order(entry);
>> >> +	struct folio *folio = dax_to_folio(entry);
>> >>  
>> >> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>> >> +	if (!dax_entry_size(entry))
>> >>  		return;
>> >>  
>> >> -	for_each_mapped_pfn(entry, pfn) {
>> >> -		struct page *page = pfn_to_page(pfn);
>> >> -
>> >> -		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
>> >> -		if (dax_page_is_shared(page)) {
>> >> -			/* keep the shared flag if this page is still shared */
>> >> -			if (dax_page_share_put(page) > 0)
>> >> -				continue;
>> >> -		} else
>> >> -			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
>> >> -		page->mapping = NULL;
>> >> -		page->index = 0;
>> >> -	}
>> >> +	/*
>> >> +	 * We don't hold a reference for the DAX pagecache entry for the
>> >> +	 * page. But we need to initialise the folio so we can hand it
>> >> +	 * out. Nothing else should have a reference either.
>> >> +	 */
>> >> +	WARN_ON_ONCE(folio_ref_count(folio));
>> >
>> > Per above I would feel more comfortable if we kept the paranoia around
>> > to ensure that all the pages in this folio have dropped all references
>> > and cleared ->mapping and ->index.
>> >
>> > That paranoia can be placed behind a CONFIG_DEBUB_VM check, and we can
>> > delete in a follow-on development cycle, but in the meantime it helps to
>> > prove the correctness of the conversion.
>> 
>> I'm ok with paranoia, but as noted above the issue is that at a minimum
>> page->mapping (and probably index) now needs to be valid for any code
>> that might walk the page tables.
>
> A quick look seems to say the confusion is limited to aliasing
> PAGE_MAPPING_ANON.

Correct. Looks like we can solve that though.

>> > [..]
>> >> @@ -1189,11 +1165,14 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
>> >>  	struct inode *inode = iter->inode;
>> >>  	unsigned long vaddr = vmf->address;
>> >>  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
>> >> +	struct page *page = pfn_t_to_page(pfn);
>> >>  	vm_fault_t ret;
>> >>  
>> >>  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
>> >>  
>> >> -	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
>> >> +	page_ref_inc(page);
>> >> +	ret = dax_insert_pfn(vmf, pfn, false);
>> >> +	put_page(page);
>> >
>> > Per above I think it is problematic to have pages live in the system
>> > without a refcount.
>> 
>> I'm a bit confused by this - the pages have a reference taken on them
>> when they are mapped. They only live in the system without a refcount
>> when the mm considers them free (except for the bit between getting
>> created in dax_associate_entry() and actually getting mapped but as
>> noted I will fix that).
>> 
>> > One scenario where this might be needed is invalidate_inode_pages() vs
>> > DMA. The invaldation should pause and wait for DMA pins to be dropped
>> > before the mapping xarray is cleaned up and the dax folio is marked
>> > free.
>> 
>> I'm not really following this scenario, or at least how it relates to
>> the comment above. If the page is pinned for DMA it will have taken a
>> refcount on it and so the page won't be considered free/idle per
>> dax_wait_page_idle() or any of the other mm code.
>
> [ tl;dr: I think we're ok, analysis below, but I did talk myself into
> the proposed dax_busy_page() changes indeed being broken and needing to
> remain checking for refcount > 1, not > 0 ]
>
> It's not the mm code I am worried about. It's the filesystem block
> allocator staying in-sync with the allocation state of the page.
>
> fs/dax.c is charged with converting idle storage blocks to pfns to
> mapped folios. Once they are mapped, DMA can pin the folio, but nothing
> in fs/dax.c pins the mapping. In the pagecache case the page reference
> is sufficient to keep the DMA-busy page from being reused. In the dax
> case something needs to arrange for DMA to be idle before
> dax_delete_mapping_entry().

Ok. How does that work today? My current mental model is that something
has to call dax_layout_busy_page() whilst holding the correct locks to
prevent a new mapping being established prior to calling
dax_delete_mapping_entry(). Is that correct?

> However, looking at XFS it indeed makes that guarantee. First it does
> xfs_break_dax_layouts() then it does truncate_inode_pages() =>
> dax_delete_mapping_entry().
>
> It follows that that the DMA-idle condition still needs to look for the
> case where the refcount is > 1 rather than 0 since refcount == 1 is the
> page-mapped-but-DMA-idle condition.

Sorry, but I'm still not following this line of reasoning. If the
refcount == 1 the page is either mapped xor DMA-busy. So a refcount >= 1
is enough to conclude that the page cannot be reused because it is
either being accessed from userspace via a CPU mapping or from some
device DMA or some other in kernel user.

The current proposal is that dax_busy_page() returns true if refcount >=
1, and dax_wait_page_idle() will wait until the refcount ==
0. dax_busy_page() will try and force the refcount == 0 by unmapping it,
but obviously can't force other pinners to release their reference hence
the need to wait. Callers should already be holding locks to ensure new
mappings can't be established and hence can't become DMA-busy after the
unmap.

> [..]
>> >> @@ -1649,9 +1627,10 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>> >>  	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
>> >>  	bool write = iter->flags & IOMAP_WRITE;
>> >>  	unsigned long entry_flags = pmd ? DAX_PMD : 0;
>> >> -	int err = 0;
>> >> +	int ret, err = 0;
>> >>  	pfn_t pfn;
>> >>  	void *kaddr;
>> >> +	struct page *page;
>> >>  
>> >>  	if (!pmd && vmf->cow_page)
>> >>  		return dax_fault_cow_page(vmf, iter);
>> >> @@ -1684,14 +1663,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>> >>  	if (dax_fault_is_synchronous(iter, vmf->vma))
>> >>  		return dax_fault_synchronous_pfnp(pfnp, pfn);
>> >>  
>> >> -	/* insert PMD pfn */
>> >> +	page = pfn_t_to_page(pfn);
>> >
>> > I think this is clearer if dax_insert_entry() returns folios with an
>> > elevated refrence count that is dropped when the folio is invalidated
>> > out of the mapping.
>> 
>> I presume this comment is for the next line:
>> 
>> +	page_ref_inc(page);
>>  
>> I can move that into dax_insert_entry(), but we would still need to
>> drop it after calling vmf_insert_*() to ensure we get the 1 -> 0
>> transition when the page is unmapped and therefore
>> freed. Alternatively we can make it so vmf_insert_*() don't take
>> references on the page, and instead ownership of the reference is
>> transfered to the mapping. Personally I prefered having those
>> functions take their own reference but let me know what you think.
>
> Oh, the model I was thinking was that until vmf_insert_XXX() succeeds
> then the page was never allocated because it was never mapped. What
> happens with the code as proposed is that put_page() triggers page-free
> semantics on vmf_insert_XXX() failures, right?

Right. And actually that means I can't move the page_ref_inc(page) into
what will be called dax_create_folio(), because an entry may have been
created previously that had a failed vmf_insert_XXX() which will
therefore have a zero refcount folio associated with it.

But I think that model is wrong. I think the model needs to be the page
gets allocated when the entry is first created (ie. when
dax_create_folio() is called). A subsequent free (either due to
vmf_insert_XXX() failing or the page being unmapped or becoming
DMA-idle) should then delete the entry.

I think that makes the semantics around dax_busy_page() nicer as well -
no need for the truncate to have a special path to call
dax_delete_mapping_entry().

> There is no need to invoke the page-free / final-put path on
> vmf_insert_XXX() error because the storage-block / pfn never actually
> transitioned into a page / folio.

It's not mapping a page/folio that transitions a pfn into a page/folio
it is the allocation of the folio that happens in dax_create_folio()
(aka. dax_associate_new_entry()). So we need to delete the entry (as
noted above I don't do that currently) if the insertion fails.

>> > [..]
>> >> @@ -519,21 +529,3 @@ void zone_device_page_init(struct page *page)
>> >>  	lock_page(page);
>> >>  }
>> >>  EXPORT_SYMBOL_GPL(zone_device_page_init);
>> >> -
>> >> -#ifdef CONFIG_FS_DAX
>> >> -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
>> >> -{
>> >> -	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
>> >> -		return false;
>> >> -
>> >> -	/*
>> >> -	 * fsdax page refcounts are 1-based, rather than 0-based: if
>> >> -	 * refcount is 1, then the page is free and the refcount is
>> >> -	 * stable because nobody holds a reference on the page.
>> >> -	 */
>> >> -	if (folio_ref_sub_return(folio, refs) == 1)
>> >> -		wake_up_var(&folio->_refcount);
>> >> -	return true;
>> >
>> > It follow from the refcount disvussion above that I think there is an
>> > argument to still keep this wakeup based on the 2->1 transitition.
>> > pagecache pages are refcount==1 when they are dma-idle but still
>> > allocated. To keep the same semantics for dax a dax_folio would have an
>> > elevated refcount whenever it is referenced by mapping entry.
>> 
>> I'm not sold on keeping it as it doesn't seem to offer any benefit
>> IMHO. I know both Jason and Christoph were keen to see it go so it be
>> good to get their feedback too. Also one of the primary goals of this
>> series was to refcount the page normally so we could remove the whole
>> "page is free with a refcount of 1" semantics.
>
> The page is still free at refcount 0, no argument there. But, by
> introducing a new "page refcount is elevated while mapped" (as it
> should), it follows that "page is DMA idle at refcount == 1", right?

No. The page is either mapped xor DMA-busy - ie. not free. If we want
(need?) to tell the difference we can use folio_maybe_dma_pinned(),
assuming the driver doing DMA has called pin_user_pages() as it should.

That said I'm not sure why we care about the distinction between
DMA-idle and mapped? If the page is not free from the mm perspective the
block can't be reallocated by the filesystem.

> Otherwise, the current assumption that fileystems can have
> dax_layout_busy_page_range() poll on the state of the pfn in the mapping
> is broken because page refcount == 0 also means no page to mapping
> association.

And also means nothing from the mm (userspace mapping, DMA-busy, etc.)
is using the page so the page isn't busy and is free to be reallocated
right?

 - Alistair
Dan Williams Oct. 25, 2024, 4:35 a.m. UTC | #5
Alistair Popple wrote:
[..]
> >> I'm not really following this scenario, or at least how it relates to
> >> the comment above. If the page is pinned for DMA it will have taken a
> >> refcount on it and so the page won't be considered free/idle per
> >> dax_wait_page_idle() or any of the other mm code.
> >
> > [ tl;dr: I think we're ok, analysis below, but I did talk myself into
> > the proposed dax_busy_page() changes indeed being broken and needing to
> > remain checking for refcount > 1, not > 0 ]
> >
> > It's not the mm code I am worried about. It's the filesystem block
> > allocator staying in-sync with the allocation state of the page.
> >
> > fs/dax.c is charged with converting idle storage blocks to pfns to
> > mapped folios. Once they are mapped, DMA can pin the folio, but nothing
> > in fs/dax.c pins the mapping. In the pagecache case the page reference
> > is sufficient to keep the DMA-busy page from being reused. In the dax
> > case something needs to arrange for DMA to be idle before
> > dax_delete_mapping_entry().
> 
> Ok. How does that work today? My current mental model is that something
> has to call dax_layout_busy_page() whilst holding the correct locks to
> prevent a new mapping being established prior to calling
> dax_delete_mapping_entry(). Is that correct?

Correct. dax_delete_mapping_entry() is invoked by the filesystem with
inode locks held. See xfs_file_fallocate() where it takes the lock,
calls xfs_break_layouts() and if that succeeds performs
xfs_file_free_space() with the lock held.

xfs_file_free_space() triggers dax_delete_mapping_entry() with knowledge
that the mapping cannot be re-established until the lock is dropped.

> > However, looking at XFS it indeed makes that guarantee. First it does
> > xfs_break_dax_layouts() then it does truncate_inode_pages() =>
> > dax_delete_mapping_entry().
> >
> > It follows that that the DMA-idle condition still needs to look for the
> > case where the refcount is > 1 rather than 0 since refcount == 1 is the
> > page-mapped-but-DMA-idle condition.
> 
> Sorry, but I'm still not following this line of reasoning. If the
> refcount == 1 the page is either mapped xor DMA-busy.

No, my expectation is the refcount is 1 while the page has a mapping
entry, analogous to an idle / allocated page cache page, and the
refcount is 2 or more for DMA, get_user_pages(), or any page walker that
takes a transient page pin.

> is enough to conclude that the page cannot be reused because it is
> either being accessed from userspace via a CPU mapping or from some
> device DMA or some other in kernel user.

Userspace access is not a problem, that access can always be safely
revoked by unmapping the page, and that's what dax_layout_busy_page()
does to force a fault and re-taking the inode + mmap locks so that the
truncate path knows it has temporary exclusive access to the page, pfn,
and storage-block association.

> The current proposal is that dax_busy_page() returns true if refcount >=
> 1, and dax_wait_page_idle() will wait until the refcount ==
> 0. dax_busy_page() will try and force the refcount == 0 by unmapping it,
> but obviously can't force other pinners to release their reference hence
> the need to wait. Callers should already be holding locks to ensure new
> mappings can't be established and hence can't become DMA-busy after the
> unmap.

Am I missing a page_ref_dec() somewhere? Are you saying that
dax_layout_busy_page() will find entries with ->mapping non-NULL and
refcount == 0?

[..]
> >> >> @@ -1684,14 +1663,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
> >> >>  	if (dax_fault_is_synchronous(iter, vmf->vma))
> >> >>  		return dax_fault_synchronous_pfnp(pfnp, pfn);
> >> >>  
> >> >> -	/* insert PMD pfn */
> >> >> +	page = pfn_t_to_page(pfn);
> >> >
> >> > I think this is clearer if dax_insert_entry() returns folios with an
> >> > elevated refrence count that is dropped when the folio is invalidated
> >> > out of the mapping.
> >> 
> >> I presume this comment is for the next line:
> >> 
> >> +	page_ref_inc(page);
> >>  
> >> I can move that into dax_insert_entry(), but we would still need to
> >> drop it after calling vmf_insert_*() to ensure we get the 1 -> 0
> >> transition when the page is unmapped and therefore
> >> freed. Alternatively we can make it so vmf_insert_*() don't take
> >> references on the page, and instead ownership of the reference is
> >> transfered to the mapping. Personally I prefered having those
> >> functions take their own reference but let me know what you think.
> >
> > Oh, the model I was thinking was that until vmf_insert_XXX() succeeds
> > then the page was never allocated because it was never mapped. What
> > happens with the code as proposed is that put_page() triggers page-free
> > semantics on vmf_insert_XXX() failures, right?
> 
> Right. And actually that means I can't move the page_ref_inc(page) into
> what will be called dax_create_folio(), because an entry may have been
> created previously that had a failed vmf_insert_XXX() which will
> therefore have a zero refcount folio associated with it.

I would expect a full cleanup on vmf_insert_XXX() failure, not
leaving a zero-referenced entry.

> But I think that model is wrong. I think the model needs to be the page
> gets allocated when the entry is first created (ie. when
> dax_create_folio() is called). A subsequent free (ether due to
> vmf_insert_XXX() failing or the page being unmapped or becoming
> DMA-idle) should then delete the entry.
>
> I think that makes the semantics around dax_busy_page() nicer as well -
> no need for the truncate to have a special path to call
> dax_delete_mapping_entry().

I agree it would be lovely if the final put could clean up the mapping
entry and not depend on truncate_inode_pages_range() to do that.

...but I do not immediately see how to get there when block, pfn, and
page are so tightly coupled with dax. That's a whole new project to
introduce that paradigm, no? The page cache case gets away with
it by safely disconnecting the pfn+page from the block and then letting
DMA final put_page() take its time.

> > There is no need to invoke the page-free / final-put path on
> > vmf_insert_XXX() error because the storage-block / pfn never actually
> > transitioned into a page / folio.
> 
> It's not mapping a page/folio that transitions a pfn into a page/folio
> it is the allocation of the folio that happens in dax_create_folio()
> (aka. dax_associate_new_entry()). So we need to delete the entry (as
> noted above I don't do that currently) if the insertion fails.

Yeah, deletion on insert failure makes sense.

[..]
> >> >> @@ -519,21 +529,3 @@ void zone_device_page_init(struct page *page)
> >> >>  	lock_page(page);
> >> >>  }
> >> >>  EXPORT_SYMBOL_GPL(zone_device_page_init);
> >> >> -
> >> >> -#ifdef CONFIG_FS_DAX
> >> >> -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
> >> >> -{
> >> >> -	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
> >> >> -		return false;
> >> >> -
> >> >> -	/*
> >> >> -	 * fsdax page refcounts are 1-based, rather than 0-based: if
> >> >> -	 * refcount is 1, then the page is free and the refcount is
> >> >> -	 * stable because nobody holds a reference on the page.
> >> >> -	 */
> >> >> -	if (folio_ref_sub_return(folio, refs) == 1)
> >> >> -		wake_up_var(&folio->_refcount);
> >> >> -	return true;
> >> >
> >> > It follow from the refcount disvussion above that I think there is an
> >> > argument to still keep this wakeup based on the 2->1 transitition.
> >> > pagecache pages are refcount==1 when they are dma-idle but still
> >> > allocated. To keep the same semantics for dax a dax_folio would have an
> >> > elevated refcount whenever it is referenced by mapping entry.
> >> 
> >> I'm not sold on keeping it as it doesn't seem to offer any benefit
> >> IMHO. I know both Jason and Christoph were keen to see it go so it be
> >> good to get their feedback too. Also one of the primary goals of this
> >> series was to refcount the page normally so we could remove the whole
> >> "page is free with a refcount of 1" semantics.
> >
> > The page is still free at refcount 0, no argument there. But, by
> > introducing a new "page refcount is elevated while mapped" (as it
> > should), it follows that "page is DMA idle at refcount == 1", right?
> 
> No. The page is either mapped xor DMA-busy - ie. not free. If we want
> (need?) to tell the difference we can use folio_maybe_dma_pinned(),
> assuming the driver doing DMA has called pin_user_pages() as it should.
> 
> That said I'm not sure why we care about the distinction between
> DMA-idle and mapped? If the page is not free from the mm perspective the
> block can't be reallocated by the filesystem.

"can't be reallocated", what enforces that in your view? I am hoping it
is something I am overlooking.

In my view the filesystem has no idea of this page-to-block
relationship. All it knows is that when it wants to destroy the
page-to-block association, dax notices and says "uh, oh, this is my last
chance to make sure the block can go back into the fs allocation pool so
I need to wait for the mm to say that the page is exclusive to me (dax
core) before dax_delete_mapping_entry() destroys the page-to-block
association and the fs reclaims the allocation".

> > Otherwise, the current assumption that fileystems can have
> > dax_layout_busy_page_range() poll on the state of the pfn in the mapping
> > is broken because page refcount == 0 also means no page to mapping
> > association.
> 
> And also means nothing from the mm (userspace mapping, DMA-busy, etc.)
> is using the page so the page isn't busy and is free to be reallocated
> right?

Let's take the 'map => start dma => truncate => end dma' scenario.

At the 'end dma' step, how does the filesystem learn that the block that
it truncated, potentially hours ago, is now a free block? The filesystem
thought it reclaimed the block when truncate completed. I.e. dax says,
thou shalt 'end dma' => 'truncate' in all cases.

Note "dma" can be replaced with "any non dax core page_ref".
Alistair Popple Oct. 28, 2024, 4:24 a.m. UTC | #6
Dan Williams <dan.j.williams@intel.com> writes:

> Alistair Popple wrote:
> [..]
>> >> I'm not really following this scenario, or at least how it relates to
>> >> the comment above. If the page is pinned for DMA it will have taken a
>> >> refcount on it and so the page won't be considered free/idle per
>> >> dax_wait_page_idle() or any of the other mm code.
>> >
>> > [ tl;dr: I think we're ok, analysis below, but I did talk myself into
>> > the proposed dax_busy_page() changes indeed being broken and needing to
>> > remain checking for refcount > 1, not > 0 ]
>> >
>> > It's not the mm code I am worried about. It's the filesystem block
>> > allocator staying in-sync with the allocation state of the page.
>> >
>> > fs/dax.c is charged with converting idle storage blocks to pfns to
>> > mapped folios. Once they are mapped, DMA can pin the folio, but nothing
>> > in fs/dax.c pins the mapping. In the pagecache case the page reference
>> > is sufficient to keep the DMA-busy page from being reused. In the dax
>> > case something needs to arrange for DMA to be idle before
>> > dax_delete_mapping_entry().
>> 
>> Ok. How does that work today? My current mental model is that something
>> has to call dax_layout_busy_page() whilst holding the correct locks to
>> prevent a new mapping being established prior to calling
>> dax_delete_mapping_entry(). Is that correct?
>
> Correct. dax_delete_mapping_entry() is invoked by the filesystem with
> inode locks held. See xfs_file_fallocate() where it takes the lock,
> calls xfs_break_layouts() and if that succeeds performs
> xfs_file_free_space() with the lock held.

Thanks for confirming. I've broken it enough times during development of
this that I thought I was correct but the confirmation is nice.

> xfs_file_free_space() triggers dax_delete_mapping_entry() with knowledge
> that the mapping cannot be re-established until the lock is dropped.
>
>> > However, looking at XFS it indeed makes that guarantee. First it does
>> > xfs_break_dax_layouts() then it does truncate_inode_pages() =>
>> > dax_delete_mapping_entry().
>> >
>> > It follows that that the DMA-idle condition still needs to look for the
>> > case where the refcount is > 1 rather than 0 since refcount == 1 is the
>> > page-mapped-but-DMA-idle condition.
>> 
>> Sorry, but I'm still not following this line of reasoning. If the
>> refcount == 1 the page is either mapped xor DMA-busy.
>
> No, my expectation is the refcount is 1 while the page has a mapping
> entry, analagous to an idle / allocated page cache page, and the
> refcount is 2 or more for DMA, get_user_pages(), or any page walker that
> takes a transient page pin.

Argh, I think we may have been talking past each other. By "mapped" I
was thinking of folio_mapped() == true. Ie. page->mapcount >= 1 due to
having page table entries. I suspect you're talking about DAX page-cache
entries here?

The way the series currently works the DAX page-cache does not hold a
reference on the page. Whether or not that is a good idea (or even
valid/functionally correct) is a reasonable question and where I think
this discussion is heading (see below).

>> is enough to conclude that the page cannot be reused because it is
>> either being accessed from userspace via a CPU mapping or from some
>> device DMA or some other in kernel user.
>
> Userspace access is not a problem, that access can always be safely
> revoked by unmapping the page, and that's what dax_layout_busy_page()
> does to force a fault and re-taking the inode + mmap locks so that the
> truncate path knows it has temporary exclusive access to the page, pfn,
> and storage-block association.

Right.

>> The current proposal is that dax_busy_page() returns true if refcount >=
>> 1, and dax_wait_page_idle() will wait until the refcount ==
>> 0. dax_busy_page() will try and force the refcount == 0 by unmapping it,
>> but obviously can't force other pinners to release their reference hence
>> the need to wait. Callers should already be holding locks to ensure new
>> mappings can't be established and hence can't become DMA-busy after the
>> unmap.
>
> Am I missing a page_ref_dec() somewhere? Are you saying that
> dax_layout_busy_page() will find entries with ->mapping non-NULL and
> refcount == 0?

No, ->mapping gets set to NULL when the page is freed in
free_zone_device_folio() but I think the mapping->i_pages XArray will
still contain references to the page with a zero
refcount. Ie. truncate_inode_pages_range() will still find them and call
dax_delete_mapping_entry().

> [..]
>> >> >> @@ -1684,14 +1663,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>> >> >>  	if (dax_fault_is_synchronous(iter, vmf->vma))
>> >> >>  		return dax_fault_synchronous_pfnp(pfnp, pfn);
>> >> >>  
>> >> >> -	/* insert PMD pfn */
>> >> >> +	page = pfn_t_to_page(pfn);
>> >> >
>> >> > I think this is clearer if dax_insert_entry() returns folios with an
>> >> > elevated refrence count that is dropped when the folio is invalidated
>> >> > out of the mapping.
>> >> 
>> >> I presume this comment is for the next line:
>> >> 
>> >> +	page_ref_inc(page);
>> >>  
>> >> I can move that into dax_insert_entry(), but we would still need to
>> >> drop it after calling vmf_insert_*() to ensure we get the 1 -> 0
>> >> transition when the page is unmapped and therefore
>> >> freed. Alternatively we can make it so vmf_insert_*() don't take
>> >> references on the page, and instead ownership of the reference is
>> >> transfered to the mapping. Personally I prefered having those
>> >> functions take their own reference but let me know what you think.
>> >
>> > Oh, the model I was thinking was that until vmf_insert_XXX() succeeds
>> > then the page was never allocated because it was never mapped. What
>> > happens with the code as proposed is that put_page() triggers page-free
>> > semantics on vmf_insert_XXX() failures, right?
>> 
>> Right. And actually that means I can't move the page_ref_inc(page) into
>> what will be called dax_create_folio(), because an entry may have been
>> created previously that had a failed vmf_insert_XXX() which will
>> therefore have a zero refcount folio associated with it.
>
> I would expect a full cleanup on vmf_insert_XXX() failure, not
> leaving a zero-referenced entry.
>
>> But I think that model is wrong. I think the model needs to be the page
>> gets allocated when the entry is first created (ie. when
>> dax_create_folio() is called). A subsequent free (either due to
>> vmf_insert_XXX() failing or the page being unmapped or becoming
>> DMA-idle) should then delete the entry.
>>
>> I think that makes the semantics around dax_busy_page() nicer as well -
>> no need for the truncate to have a special path to call
>> dax_delete_mapping_entry().
>
> I agree it would be lovely if the final put could clean up the mapping
> entry and not depend on truncate_inode_pages_range() to do that.

I think I'm understanding you better now, thanks for your patience. I
think the problem here is most filesystems tend to basically do the
following:

1. Call some fs-specific version of break_dax_layouts() which:
   a) unmaps all the pages from the page-tables via
      dax_layout_busy_page()
   b) waits for DMA[1] to complete by looking at page refcounts

2. Removes DAX page-cache entries by calling
   truncate_inode_pages_range() or some equivalent.

In this series this works because the DAX page-cache doesn't hold a page
reference nor does it call dax_delete_mapping_entry() on free - it
relies on the truncate code to do that. So I think I understand your
original comment now:

>> > It follows that that the DMA-idle condition still needs to look for the
>> > case where the refcount is > 1 rather than 0 since refcount == 1 is the
>> > page-mapped-but-DMA-idle condition.

Because if the DAX page-cache holds a reference the refcount won't go to
zero until dax_delete_mapping_entry() is called. However this interface
seems really strange to me - filesystems call
dax_layout_busy_page()/dax_wait_page_idle() to make sure both user-space
and DMA[1] have finished with the page, but not the DAX code which still
has references in its page-cache.

Is there some reason for this? In other words, why can't the interface to
the filesystem be something like calling dax_break_layouts() which
ensures everything, including core FS DAX code, has finished with the
page(s) in question? I don't see why that wouldn't work for at least
EXT4 and XFS (FUSE seemed a bit different but I haven't dug too deeply).

If we could do that dax_break_layouts() would essentially:
1. unmap userspace via eg. unmap_mapping_pages() to drive the refcount
   down.
2. delete the DAX page-cache entry to remove its refcount.
3. wait for DMA to complete by waiting for the refcount to hit zero.

The problem with the filesystem truncate code at the moment is steps 2
and 3 are reversed so step 3 has to wait for a refcount of 1 as you
pointed out previously. But does that matter? Are there ever cases when
a filesystem needs to wait for the page to be idle but maintain its DAX
page-cache entry?

I may be missing something though, because I was having trouble getting
this scheme to actually work today.

[1] - Where "DMA" means any unknown page reference

> ...but I do not immediately see how to get there when block, pfn, and
> page are so tightly coupled with dax. That's a whole new project to
> introduce that paradigm, no? The page cache case gets away with
> it by safely disconnecting the pfn+page from the block and then letting
> DMA final put_page() take its time.

Oh of course, thanks for pointing out the difference there.

>> > There is no need to invoke the page-free / final-put path on
>> > vmf_insert_XXX() error because the storage-block / pfn never actually
>> > transitioned into a page / folio.
>> 
>> It's not mapping a page/folio that transitions a pfn into a page/folio
>> it is the allocation of the folio that happens in dax_create_folio()
>> (aka. dax_associate_new_entry()). So we need to delete the entry (as
>> noted above I don't do that currently) if the insertion fails.
>
> Yeah, deletion on insert failure makes sense.
>
> [..]
>> >> >> @@ -519,21 +529,3 @@ void zone_device_page_init(struct page *page)
>> >> >>  	lock_page(page);
>> >> >>  }
>> >> >>  EXPORT_SYMBOL_GPL(zone_device_page_init);
>> >> >> -
>> >> >> -#ifdef CONFIG_FS_DAX
>> >> >> -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
>> >> >> -{
>> >> >> -	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
>> >> >> -		return false;
>> >> >> -
>> >> >> -	/*
>> >> >> -	 * fsdax page refcounts are 1-based, rather than 0-based: if
>> >> >> -	 * refcount is 1, then the page is free and the refcount is
>> >> >> -	 * stable because nobody holds a reference on the page.
>> >> >> -	 */
>> >> >> -	if (folio_ref_sub_return(folio, refs) == 1)
>> >> >> -		wake_up_var(&folio->_refcount);
>> >> >> -	return true;
>> >> >
>> >> > It follow from the refcount disvussion above that I think there is an
>> >> > argument to still keep this wakeup based on the 2->1 transitition.
>> >> > pagecache pages are refcount==1 when they are dma-idle but still
>> >> > allocated. To keep the same semantics for dax a dax_folio would have an
>> >> > elevated refcount whenever it is referenced by mapping entry.
>> >> 
>> >> I'm not sold on keeping it as it doesn't seem to offer any benefit
>> >> IMHO. I know both Jason and Christoph were keen to see it go so it be
>> >> good to get their feedback too. Also one of the primary goals of this
>> >> series was to refcount the page normally so we could remove the whole
>> >> "page is free with a refcount of 1" semantics.
>> >
>> > The page is still free at refcount 0, no argument there. But, by
>> > introducing a new "page refcount is elevated while mapped" (as it
>> > should), it follows that "page is DMA idle at refcount == 1", right?
>> 
>> No. The page is either mapped xor DMA-busy - ie. not free. If we want
>> (need?) to tell the difference we can use folio_maybe_dma_pinned(),
>> assuming the driver doing DMA has called pin_user_pages() as it should.
>> 
>> That said I'm not sure why we care about the distinction between
>> DMA-idle and mapped? If the page is not free from the mm perspective the
>> block can't be reallocated by the filesystem.
>
> "can't be reallocated", what enforces that in your view? I am hoping it
> is something I am overlooking.
>
> In my view the filesystem has no idea of this page-to-block
> relationship. All it knows is that when it wants to destroy the
> page-to-block association, dax notices and says "uh, oh, this is my last
> chance to make sure the block can go back into the fs allocation pool so
> I need to wait for the mm to say that the page is exclusive to me (dax
> core) before dax_delete_mapping_entry() destroys the page-to-block
> association and the fs reclaims the allocation".
>
>> > Otherwise, the current assumption that fileystems can have
>> > dax_layout_busy_page_range() poll on the state of the pfn in the mapping
>> > is broken because page refcount == 0 also means no page to mapping
>> > association.
>> 
>> And also means nothing from the mm (userspace mapping, DMA-busy, etc.)
>> is using the page so the page isn't busy and is free to be reallocated
>> right?
>
> Lets take the 'map => start dma => truncate => end dma' scenario.
>
> At the 'end dma' step, how does the filesystem learn that the block that
> it truncated, potentially hours ago, is now a free block? The filesystem
> thought it reclaimed the block when truncate completed. I.e. dax says,
> thou shalt 'end dma' => 'truncate' in all cases.

Agreed, but I don't think I was suggesting we change that. I agree DAX
has to ensure 'end dma' happens before truncate completes.

> Note "dma" can be replaced with "any non dax core page_ref".
Dan Williams Oct. 29, 2024, 2:03 a.m. UTC | #7
Alistair Popple wrote:
[..]

> >> > It follows that that the DMA-idle condition still needs to look for the
> >> > case where the refcount is > 1 rather than 0 since refcount == 1 is the
> >> > page-mapped-but-DMA-idle condition.
> 
> Because if the DAX page-cache holds a reference the refcount won't go to
> zero until dax_delete_mapping_entry() is called. However this interface
> seems really strange to me - filesystems call
> dax_layout_busy_page()/dax_wait_page_idle() to make sure both user-space
> and DMA[1] have finished with the page, but not the DAX code which still
> has references in it's page-cache.

First, I appreciate the clarification that I was mixing up "mapped"
(elevated map count) with, for lack of a better term, "tracked" (mapping
entry valid).

So, to repeat back to you what I understand now, the proposal is to
attempt to allow _count==0 as the DMA idle condition, but still have the
final return of the block to the allocator (fs allocator) occur after
dax_delete_mapping_entry().

> Is there some reason for this? In order words why can't the interface to
> the filesystem be something like calling dax_break_layouts() which
> ensures everything, including core FS DAX code, has finished with the
> page(s) in question? I don't see why that wouldn't work for at least
> EXT4 and XFS (FUSE seemed a bit different but I haven't dug too deeply).
> 
> If we could do that dax_break_layouts() would essentially:
> 1. unmap userspace via eg. unmap_mapping_pages() to drive the refcount
>    down.

Am I missing where unmap_mapping_pages() drops the _count? I can see
where it drops _mapcount. I don't think that matters for the proposal,
but that's my last gap in tracking the proposed refcount model.

> 2. delete the DAX page-cache entry to remove its refcount.
> 3. wait for DMA to complete by waiting for the refcount to hit zero.
> 
> The problem with the filesystem truncate code at the moment is steps 2
> and 3 are reversed so step 3 has to wait for a refcount of 1 as you
> pointed out previously. But does that matter? Are there ever cases when
> a filesystem needs to wait for the page to be idle but maintain it's DAX
> page-cache entry?

No, not that I can think of. The filesystem just cares that the page was
seen as part of the file at some point and that it is holding locks to
keep the block associated with that page allocated to the file until it
can complete its operation.

I think what we are talking about is a pfn-state not a page state. If
the block-pfn-page lifecycle from allocation to free is deconstructed as:

    block free
    block allocated
    pfn untracked
    pfn tracked
    page free
    page busy
    page free
    pfn untracked
    block free

...then I can indeed see cases where there is pfn metadata live even
though the page is free.

So I think I was playing victim to the current implementation that
assumes that "pfn tracked" means the page is allocated and that
pfn_to_folio(pfn)->mapping is valid and not NULL.

All this to say I am at least on the same page as you that _count == 0
can be used as the page free state even if the pfn tracking goes through
delayed cleanup.

However, if vmf_insert_XXX is increasing _count then, per my
unmap_mapping_pages() question above, I think dax_wait_page_idle() needs
to call try_to_unmap() to drop that _count, right? Similar observation
for the memory_failure_dev_pagemap() path, I think that path only calls
unmap_mapping_range() not try_to_unmap() and leaves _count elevated.

Lastly walking through the code again I think this fix is valid today:

diff --git a/fs/dax.c b/fs/dax.c
index fcbe62bde685..48f2c85690e1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -660,7 +660,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
        pgoff_t end_idx;
        XA_STATE(xas, &mapping->i_pages, start_idx);
 
-       if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+       if (!dax_mapping(mapping))
                return NULL;
 
        /* If end == LLONG_MAX, all pages from start to till end of file */


...because unmap_mapping_pages() will mark the mapping as unmapped even
though there are "pfn tracked + page busy" entries to clean up.

Appreciate you grappling this with me!
Alistair Popple Oct. 30, 2024, 5:57 a.m. UTC | #8
Dan Williams <dan.j.williams@intel.com> writes:

> Alistair Popple wrote:
> [..]
>
>> >> > It follows that that the DMA-idle condition still needs to look for the
>> >> > case where the refcount is > 1 rather than 0 since refcount == 1 is the
>> >> > page-mapped-but-DMA-idle condition.
>> 
>> Because if the DAX page-cache holds a reference the refcount won't go to
>> zero until dax_delete_mapping_entry() is called. However this interface
>> seems really strange to me - filesystems call
>> dax_layout_busy_page()/dax_wait_page_idle() to make sure both user-space
>> and DMA[1] have finished with the page, but not the DAX code which still
>> has references in it's page-cache.
>
> First, I appreciate the clarification that I was mixing up "mapped"
> (elevated map count) with, for lack of a better term, "tracked" (mapping
> entry valid).
>
> So, to repeat back to you what I understand now, the proposal is to
> attempt to allow _count==0 as the DMA idle condition, but still have the
> final return of the block to the allocator (fs allocator) occur after
> dax_delete_mapping_entry().

Right, that is what I would like to achieve if possible. The outstanding
question I think is "should the DAX page-cache have a reference on the
page?". Or to use your terminology below "if a pfn is tracked should
pfn_to_page(pfn)->_refcount == 0 or 1?"

This version implements it as being zero because altering that requires
re-ordering all the existing filesystem and mm users of
dax_layout_busy_range() and dax_delete_mapping_entry(). Based on this
discussion though I'm beginning to think it probably should be one, but
I haven't been able to make that work yet.

>> Is there some reason for this? In order words why can't the interface to
>> the filesystem be something like calling dax_break_layouts() which
>> ensures everything, including core FS DAX code, has finished with the
>> page(s) in question? I don't see why that wouldn't work for at least
>> EXT4 and XFS (FUSE seemed a bit different but I haven't dug too deeply).
>> 
>> If we could do that dax_break_layouts() would essentially:
>> 1. unmap userspace via eg. unmap_mapping_pages() to drive the refcount
>>    down.
>
> Am I missing where unmap_mapping_pages() drops the _count? I can see
> where it drops _mapcount. I don't think that matters for the proposal,
> but that's my last gap in tracking the proposed refcount model.

It is suitably obtuse due to MMU_GATHER. unmap_mapping_pages() drops the
folio/page reference after flushing the TLB. Ie:

=> tlb_finish_mmu
    => tlb_flush_mmu
        => __tlb_batch_free_encoded_pages
            => free_pages_and_swap_cache
                => folios_put_refs

>> 2. delete the DAX page-cache entry to remove its refcount.
>> 3. wait for DMA to complete by waiting for the refcount to hit zero.
>> 
>> The problem with the filesystem truncate code at the moment is steps 2
>> and 3 are reversed so step 3 has to wait for a refcount of 1 as you
>> pointed out previously. But does that matter? Are there ever cases when
>> a filesystem needs to wait for the page to be idle but maintain it's DAX
>> page-cache entry?
>
> No, not that I can think of. The filesystem just cares that the page was
> seen as part of the file at some point and that it is holding locks to
> keep the block associated with that page allocated to the file until it
> can complete its operation.
>
> I think what we are talking about is a pfn-state not a page state. If
> the block-pfn-page lifecycle from allocation to free is deconstructed as:
>
>     block free
>     block allocated
>     pfn untracked
>     pfn tracked
>     page free
>     page busy
>     page free
>     pfn untracked
>     block free
>
> ...then I can indeed see cases where there is pfn metadata live even
> though the page is free.
>
> So I think I was playing victim to the current implementation that
> assumes that "pfn tracked" means the page is allocated and that
> pfn_to_folio(pfn)->mapping is valid and not NULL.
>
> All this to say I am at least on the same page as you that _count == 0
> can be used as the page free state even if the pfn tracking goes through
> delayed cleanup.

Great, and I like this terminology of pfn tracked, etc.

> However, if vmf_insert_XXX is increasing _count then, per my
> unmap_mapping_pages() question above, I think dax_wait_page_idle() needs
> to call try_to_unmap() to drop that _count, right?

At the moment filesystems open-code their own version of
XXXX_break_layouts() which typically calls dax_layout_busy_page()
followed by dax_wait_page_idle(). The former will call
unmap_mapping_range(), which for shared mappings I thought should be
sufficient to find and unmap all page table references (and therefore
folio/page _refcounts) based on the address space / index.

I think try_to_unmap() would only be necessary if we only had the folio
and not the address space / index and therefore needed to find them from
the mm (not fs!) rmap.

> Similar observation for the memory_failure_dev_pagemap() path, I think
> that path only calls unmap_mapping_range() not try_to_unmap() and
> leaves _count elevated.

As noted above unmap_mapping_range() will drop the refcount whenever it
clears a pte/pmd mapping the folio and I think it should find all the
pte's mapping it.

> Lastly walking through the code again I think this fix is valid today:
>
> diff --git a/fs/dax.c b/fs/dax.c
> index fcbe62bde685..48f2c85690e1 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -660,7 +660,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
>         pgoff_t end_idx;
>         XA_STATE(xas, &mapping->i_pages, start_idx);
>  
> -       if (!dax_mapping(mapping) || !mapping_mapped(mapping))
> +       if (!dax_mapping(mapping))
>                 return NULL;
>  
>         /* If end == LLONG_MAX, all pages from start to till end of file */
>
>
> ...because unmap_mapping_pages() will mark the mapping as unmapped even
> though there are "pfn tracked + page busy" entries to clean up.

Yep, I noticed this today when I was trying to figure out why my
re-ordering of the unmap/wait/untrack pfn wasn't working as expected. It
still isn't for some other reason, and I'm still figuring out if the
above is correct/valid, but it is on my list of things to look more
closely at.

> Appreciate you grappling this with me!

Not at all! And thank you as well ... I feel like this has helped me a
lot in getting a slightly better understanding of the problems. Also
unless you react violently to anything I've said here I think I have
enough material to post (and perhaps even explain!) the next version of
this series.

 - Alistair
diff mbox series

Patch

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 9c1a729..4d3ddd1 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -126,11 +126,11 @@  static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+	pfn = phys_to_pfn_t(phys, 0);
 
 	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+	return dax_insert_pfn(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
 
 static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
@@ -169,11 +169,11 @@  static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+	pfn = phys_to_pfn_t(phys, 0);
 
 	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+	return dax_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
@@ -214,11 +214,11 @@  static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
+	pfn = phys_to_pfn_t(phys, 0);
 
 	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+	return dax_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index e16d1d4..57a94a6 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -257,7 +257,7 @@  EXPORT_SYMBOL_GPL(dax_holder_notify_failure);
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
 {
-	if (unlikely(!dax_write_cache_enabled(dax_dev)))
+	if (unlikely(dax_dev && !dax_write_cache_enabled(dax_dev)))
 		return;
 
 	arch_wb_cache_pmem(addr, size);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 210fb77..451cd0f 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -513,7 +513,7 @@  static int pmem_attach_disk(struct device *dev,
 
 	pmem->disk = disk;
 	pmem->pgmap.owner = pmem;
-	pmem->pfn_flags = PFN_DEV;
+	pmem->pfn_flags = 0;
 	if (is_nd_pfn(dev)) {
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
@@ -522,7 +522,6 @@  static int pmem_attach_disk(struct device *dev,
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
 		pmem->pfn_pad = resource_size(res) -
 			range_len(&pmem->pgmap.range);
-		pmem->pfn_flags |= PFN_MAP;
 		bb_range = pmem->pgmap.range;
 		bb_range.start += pmem->data_offset;
 	} else if (pmem_should_map_pages(dev)) {
@@ -532,7 +531,6 @@  static int pmem_attach_disk(struct device *dev,
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
-		pmem->pfn_flags |= PFN_MAP;
 		bb_range = pmem->pgmap.range;
 	} else {
 		addr = devm_memremap(dev, pmem->phys_addr,
diff --git a/fs/dax.c b/fs/dax.c
index becb4a6..05f7b88 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -71,6 +71,11 @@  static unsigned long dax_to_pfn(void *entry)
 	return xa_to_value(entry) >> DAX_SHIFT;
 }
 
+static struct folio *dax_to_folio(void *entry)
+{
+	return page_folio(pfn_to_page(dax_to_pfn(entry)));
+}
+
 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
 	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
@@ -318,85 +323,58 @@  static unsigned long dax_end_pfn(void *entry)
  */
 #define for_each_mapped_pfn(entry, pfn) \
 	for (pfn = dax_to_pfn(entry); \
-			pfn < dax_end_pfn(entry); pfn++)
+		pfn < dax_end_pfn(entry); pfn++)
 
-static inline bool dax_page_is_shared(struct page *page)
+static void dax_device_folio_init(struct folio *folio, int order)
 {
-	return page->mapping == PAGE_MAPPING_DAX_SHARED;
-}
+	int orig_order = folio_order(folio);
+	int i;
 
-/*
- * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
- * refcount.
- */
-static inline void dax_page_share_get(struct page *page)
-{
-	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
-		/*
-		 * Reset the index if the page was already mapped
-		 * regularly before.
-		 */
-		if (page->mapping)
-			page->share = 1;
-		page->mapping = PAGE_MAPPING_DAX_SHARED;
-	}
-	page->share++;
-}
+	if (orig_order != order) {
+		struct dev_pagemap *pgmap = page_dev_pagemap(&folio->page);
 
-static inline unsigned long dax_page_share_put(struct page *page)
-{
-	return --page->share;
-}
+		for (i = 0; i < (1UL << orig_order); i++) {
+			struct page *page = folio_page(folio, i);
 
-/*
- * When it is called in dax_insert_entry(), the shared flag will indicate that
- * whether this entry is shared by multiple files.  If so, set the page->mapping
- * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
- */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
-		struct vm_area_struct *vma, unsigned long address, bool shared)
-{
-	unsigned long size = dax_entry_size(entry), pfn, index;
-	int i = 0;
+			ClearPageHead(page);
+			clear_compound_head(page);
 
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
-
-	index = linear_page_index(vma, address & ~(size - 1));
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
+			/*
+			 * Reset pgmap which was over-written by
+			 * prep_compound_page().
+			 */
+			page_folio(page)->pgmap = pgmap;
 
-		if (shared) {
-			dax_page_share_get(page);
-		} else {
-			WARN_ON_ONCE(page->mapping);
-			page->mapping = mapping;
-			page->index = index + i++;
+			/* Make sure this isn't set to TAIL_MAPPING */
+			page->mapping = NULL;
 		}
 	}
+
+	if (order > 0) {
+		prep_compound_page(&folio->page, order);
+		if (order > 1)
+			INIT_LIST_HEAD(&folio->_deferred_list);
+	}
 }
 
-static void dax_disassociate_entry(void *entry, struct address_space *mapping,
-		bool trunc)
+static void dax_associate_new_entry(void *entry, struct address_space *mapping,
+				pgoff_t index)
 {
-	unsigned long pfn;
+	unsigned long order = dax_entry_order(entry);
+	struct folio *folio = dax_to_folio(entry);
 
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+	if (!dax_entry_size(entry))
 		return;
 
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-		if (dax_page_is_shared(page)) {
-			/* keep the shared flag if this page is still shared */
-			if (dax_page_share_put(page) > 0)
-				continue;
-		} else
-			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
-		page->mapping = NULL;
-		page->index = 0;
-	}
+	/*
+	 * We don't hold a reference for the DAX pagecache entry for the
+	 * page. But we need to initialise the folio so we can hand it
+	 * out. Nothing else should have a reference either.
+	 */
+	WARN_ON_ONCE(folio_ref_count(folio));
+	dax_device_folio_init(folio, order);
+	folio->mapping = mapping;
+	folio->index = index;
 }
 
 static struct page *dax_busy_page(void *entry)
@@ -406,7 +384,7 @@  static struct page *dax_busy_page(void *entry)
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		if (page_ref_count(page) > 1)
+		if (page_ref_count(page))
 			return page;
 	}
 	return NULL;
@@ -620,7 +598,6 @@  static void *grab_mapping_entry(struct xa_state *xas,
 			xas_lock_irq(xas);
 		}
 
-		dax_disassociate_entry(entry, mapping, false);
 		xas_store(xas, NULL);	/* undo the PMD join */
 		dax_wake_entry(xas, entry, WAKE_ALL);
 		mapping->nrpages -= PG_PMD_NR;
@@ -743,7 +720,7 @@  struct page *dax_layout_busy_page(struct address_space *mapping)
 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
 
 static int __dax_invalidate_entry(struct address_space *mapping,
-					  pgoff_t index, bool trunc)
+				  pgoff_t index, bool trunc)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
 	int ret = 0;
@@ -757,7 +734,6 @@  static int __dax_invalidate_entry(struct address_space *mapping,
 	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
 	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
 		goto out;
-	dax_disassociate_entry(entry, mapping, trunc);
 	xas_store(&xas, NULL);
 	mapping->nrpages -= 1UL << dax_entry_order(entry);
 	ret = 1;
@@ -894,9 +870,11 @@  static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		void *old;
 
-		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
-				shared);
+		if (!shared) {
+			dax_associate_new_entry(new_entry, mapping,
+				linear_page_index(vmf->vma, vmf->address));
+		}
+
 		/*
 		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
@@ -1084,9 +1062,7 @@  static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
 		goto out;
 	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
 		goto out;
-	/* For larger pages we need devmap */
-	if (length > 1 && !pfn_t_devmap(*pfnp))
-		goto out;
+
 	rc = 0;
 
 out_check_addr:
@@ -1189,11 +1165,14 @@  static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	struct inode *inode = iter->inode;
 	unsigned long vaddr = vmf->address;
 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
+	struct page *page = pfn_t_to_page(pfn);
 	vm_fault_t ret;
 
 	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
 
-	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
+	page_ref_inc(page);
+	ret = dax_insert_pfn(vmf, pfn, false);
+	put_page(page);
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
 }
@@ -1212,8 +1191,13 @@  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	pmd_t pmd_entry;
 	pfn_t pfn;
 
-	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
+	if (arch_needs_pgtable_deposit()) {
+		pgtable = pte_alloc_one(vma->vm_mm);
+		if (!pgtable)
+			return VM_FAULT_OOM;
+	}
 
+	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
 	if (unlikely(!zero_folio))
 		goto fallback;
 
@@ -1221,29 +1205,23 @@  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
 				  DAX_PMD | DAX_ZERO_PAGE);
 
-	if (arch_needs_pgtable_deposit()) {
-		pgtable = pte_alloc_one(vma->vm_mm);
-		if (!pgtable)
-			return VM_FAULT_OOM;
-	}
-
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
-	if (!pmd_none(*(vmf->pmd))) {
-		spin_unlock(ptl);
-		goto fallback;
-	}
+	if (!pmd_none(*vmf->pmd))
+		goto fallback_unlock;
 
-	if (pgtable) {
-		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
-		mm_inc_nr_ptes(vma->vm_mm);
-	}
-	pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
+	pmd_entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
 	pmd_entry = pmd_mkhuge(pmd_entry);
-	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
+	if (pgtable)
+		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+	set_pmd_at(vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
 	trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
 	return VM_FAULT_NOPAGE;
 
+fallback_unlock:
+	spin_unlock(ptl);
+	mm_put_huge_zero_folio(vma->vm_mm);
+
 fallback:
 	if (pgtable)
 		pte_free(vma->vm_mm, pgtable);
@@ -1649,9 +1627,10 @@  static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
 	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
 	bool write = iter->flags & IOMAP_WRITE;
 	unsigned long entry_flags = pmd ? DAX_PMD : 0;
-	int err = 0;
+	int ret, err = 0;
 	pfn_t pfn;
 	void *kaddr;
+	struct page *page;
 
 	if (!pmd && vmf->cow_page)
 		return dax_fault_cow_page(vmf, iter);
@@ -1684,14 +1663,21 @@  static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
 	if (dax_fault_is_synchronous(iter, vmf->vma))
 		return dax_fault_synchronous_pfnp(pfnp, pfn);
 
-	/* insert PMD pfn */
+	page = pfn_t_to_page(pfn);
+	page_ref_inc(page);
+
 	if (pmd)
-		return vmf_insert_pfn_pmd(vmf, pfn, write);
+		ret = dax_insert_pfn_pmd(vmf, pfn, write);
+	else
+		ret = dax_insert_pfn(vmf, pfn, write);
 
-	/* insert PTE pfn */
-	if (write)
-		return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
-	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+	/*
+	 * Inserting the PMD/PTE takes its own reference on the page when
+	 * mapping it, so drop ours.
+	 */
+	put_page(page);
+
+	return ret;
 }
 
 static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
@@ -1932,6 +1918,7 @@  dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
 	void *entry;
 	vm_fault_t ret;
+	struct page *page;
 
 	xas_lock_irq(&xas);
 	entry = get_unlocked_entry(&xas, order);
@@ -1947,14 +1934,17 @@  dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
 	dax_lock_entry(&xas, entry);
 	xas_unlock_irq(&xas);
+	page = pfn_t_to_page(pfn);
+	page_ref_inc(page);
 	if (order == 0)
-		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+		ret = dax_insert_pfn(vmf, pfn, true);
 #ifdef CONFIG_FS_DAX_PMD
 	else if (order == PMD_ORDER)
-		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
+		ret = dax_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
 #endif
 	else
 		ret = VM_FAULT_FALLBACK;
+	put_page(page);
 	dax_unlock_entry(&xas, entry);
 	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
 	return ret;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index dd52601..f79a94d 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -875,8 +875,7 @@  static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (kaddr)
 		*kaddr = fs->window_kaddr + offset;
 	if (pfn)
-		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
-					PFN_DEV | PFN_MAP);
+		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
 	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
 }
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 773dfc4..0f6f355 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -217,8 +217,12 @@  static inline int dax_wait_page_idle(struct page *page,
 				void (cb)(struct inode *),
 				struct inode *inode)
 {
-	return ___wait_var_event(page, page_ref_count(page) == 1,
+	int ret;
+
+	ret = ___wait_var_event(page, !page_ref_count(page),
 				TASK_INTERRUPTIBLE, 0, 0, cb(inode));
+
+	return ret;
 }
 
 #if IS_ENABLED(CONFIG_DAX)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 935e493..592b992 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1071,6 +1071,8 @@  int vma_is_stack_for_current(struct vm_area_struct *vma);
 struct mmu_gather;
 struct inode;
 
+extern void prep_compound_page(struct page *page, unsigned int order);
+
 /*
  * compound_order() can be called without holding a reference, which means
  * that niceties like page_folio() don't work.  These callers should be
@@ -1394,25 +1396,6 @@  vm_fault_t finish_fault(struct vm_fault *vmf);
  *   back into memory.
  */
 
-#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
-DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
-static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
-	if (!static_branch_unlikely(&devmap_managed_key))
-		return false;
-	if (!folio_is_zone_device(folio))
-		return false;
-	return __put_devmap_managed_folio_refs(folio, refs);
-}
-#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
-	return false;
-}
-#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-
 /* 127: arbitrary random number, small enough to assemble well */
 #define folio_ref_zero_or_close_to_overflow(folio) \
 	((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1527,12 +1510,6 @@  static inline void put_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
 
-	/*
-	 * For some devmap managed pages we need to catch refcount transition
-	 * from 2 to 1:
-	 */
-	if (put_devmap_managed_folio_refs(folio, 1))
-		return;
 	folio_put(folio);
 }
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2175ebc..0326a41 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -667,12 +667,6 @@  PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
 #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 
-/*
- * Different with flags above, this flag is used only for fsdax mode.  It
- * indicates that this page->mapping is now under reflink case.
- */
-#define PAGE_MAPPING_DAX_SHARED	((void *)0x1)
-
 static __always_inline bool folio_mapping_flags(const struct folio *folio)
 {
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
diff --git a/mm/gup.c b/mm/gup.c
index 5d2fc9a..798c92b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -91,8 +91,7 @@  static inline struct folio *try_get_folio(struct page *page, int refs)
 	 * belongs to this folio.
 	 */
 	if (unlikely(page_folio(page) != folio)) {
-		if (!put_devmap_managed_folio_refs(folio, refs))
-			folio_put_refs(folio, refs);
+		folio_put_refs(folio, refs);
 		goto retry;
 	}
 
@@ -111,8 +110,7 @@  static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
 			refs *= GUP_PIN_COUNTING_BIAS;
 	}
 
-	if (!put_devmap_managed_folio_refs(folio, refs))
-		folio_put_refs(folio, refs);
+	folio_put_refs(folio, refs);
 }
 
 /**
@@ -543,8 +541,7 @@  static struct folio *try_grab_folio_fast(struct page *page, int refs,
 	 */
 	if (unlikely((flags & FOLL_LONGTERM) &&
 		     !folio_is_longterm_pinnable(folio))) {
-		if (!put_devmap_managed_folio_refs(folio, refs))
-			folio_put_refs(folio, refs);
+		folio_put_refs(folio, refs);
 		return NULL;
 	}
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 790041e..ab2cd4e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2017,7 +2017,7 @@  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 						tlb->fullmm);
 	arch_check_zapped_pmd(vma, orig_pmd);
 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-	if (vma_is_special_huge(vma)) {
+	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
@@ -2661,13 +2661,15 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(mm, pmd);
-		if (vma_is_special_huge(vma))
+		if (!vma_is_dax(vma) && vma_is_special_huge(vma))
 			return;
 		if (unlikely(is_pmd_migration_entry(old_pmd))) {
 			swp_entry_t entry;
 
 			entry = pmd_to_swp_entry(old_pmd);
 			folio = pfn_swap_entry_folio(entry);
+		} else if (is_huge_zero_pmd(old_pmd)) {
+			return;
 		} else {
 			page = pmd_page(old_pmd);
 			folio = page_folio(page);
diff --git a/mm/internal.h b/mm/internal.h
index b00ea45..08123c2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -680,8 +680,6 @@  static inline void prep_compound_tail(struct page *head, int tail_idx)
 	set_page_private(p, 0);
 }
 
-extern void prep_compound_page(struct page *page, unsigned int order);
-
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern bool free_pages_prepare(struct page *page, unsigned int order);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 96ce31e..80dd2a7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -419,18 +419,18 @@  static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
 	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return 0;
-	if (pud_devmap(*pud))
+	if (pud_trans_huge(*pud))
 		return PUD_SHIFT;
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
 		return 0;
-	if (pmd_devmap(*pmd))
+	if (pmd_trans_huge(*pmd))
 		return PMD_SHIFT;
 	pte = pte_offset_map(pmd, address);
 	if (!pte)
 		return 0;
 	ptent = ptep_get(pte);
-	if (pte_present(ptent) && pte_devmap(ptent))
+	if (pte_present(ptent))
 		ret = PAGE_SHIFT;
 	pte_unmap(pte);
 	return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 368e15d..cc692d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3752,13 +3752,15 @@  static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
 		/*
 		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
-		 * VM_PFNMAP VMA.
+		 * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
 		 *
 		 * We should not cow pages in a shared writeable mapping.
 		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 		 */
-		if (!vmf->page)
+		if (!vmf->page || is_device_dax_page(vmf->page)) {
+			vmf->page = NULL;
 			return wp_pfn_shared(vmf);
+		}
 		return wp_page_shared(vmf, folio);
 	}
 
diff --git a/mm/memremap.c b/mm/memremap.c
index e885bc9..89c0c3b 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -458,8 +458,13 @@  EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
 void free_zone_device_folio(struct folio *folio)
 {
-	if (WARN_ON_ONCE(!folio->pgmap->ops ||
-			!folio->pgmap->ops->page_free))
+	struct dev_pagemap *pgmap = folio->pgmap;
+
+	if (WARN_ON_ONCE(!pgmap->ops))
+		return;
+
+	if (WARN_ON_ONCE(pgmap->type != MEMORY_DEVICE_FS_DAX &&
+			 !pgmap->ops->page_free))
 		return;
 
 	mem_cgroup_uncharge(folio);
@@ -486,24 +491,29 @@  void free_zone_device_folio(struct folio *folio)
 	 * to clear folio->mapping.
 	 */
 	folio->mapping = NULL;
-	folio->pgmap->ops->page_free(folio_page(folio, 0));
 
-	switch (folio->pgmap->type) {
+	switch (pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
 	case MEMORY_DEVICE_COHERENT:
-		put_dev_pagemap(folio->pgmap);
+		pgmap->ops->page_free(folio_page(folio, 0));
+		put_dev_pagemap(pgmap);
 		break;
 
-	case MEMORY_DEVICE_FS_DAX:
 	case MEMORY_DEVICE_GENERIC:
 		/*
 		 * Reset the refcount to 1 to prepare for handing out the page
 		 * again.
 		 */
+		pgmap->ops->page_free(folio_page(folio, 0));
 		folio_set_count(folio, 1);
 		break;
 
+	case MEMORY_DEVICE_FS_DAX:
+		wake_up_var(&folio->page);
+		break;
+
 	case MEMORY_DEVICE_PCI_P2PDMA:
+		pgmap->ops->page_free(folio_page(folio, 0));
 		break;
 	}
 }
@@ -519,21 +529,3 @@  void zone_device_page_init(struct page *page)
 	lock_page(page);
 }
 EXPORT_SYMBOL_GPL(zone_device_page_init);
-
-#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
-	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
-		return false;
-
-	/*
-	 * fsdax page refcounts are 1-based, rather than 0-based: if
-	 * refcount is 1, then the page is free and the refcount is
-	 * stable because nobody holds a reference on the page.
-	 */
-	if (folio_ref_sub_return(folio, refs) == 1)
-		wake_up_var(&folio->_refcount);
-	return true;
-}
-EXPORT_SYMBOL(__put_devmap_managed_folio_refs);
-#endif /* CONFIG_FS_DAX */
diff --git a/mm/mlock.c b/mm/mlock.c
index e3e3dc2..5352b00 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -362,6 +362,8 @@  static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
 	unsigned long start = addr;
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (vma_is_dax(vma))
+		ptl = NULL;
 	if (ptl) {
 		if (!pmd_present(*pmd))
 			goto out;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 3d0611e..3c32190 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1015,23 +1015,22 @@  static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 	}
 
 	/*
-	 * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC and
-	 * MEMORY_TYPE_FS_DAX pages are released directly to the driver page
-	 * allocator which will set the page count to 1 when allocating the
-	 * page.
+	 * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
+	 * directly to the driver page allocator which will set the page count
+	 * to 1 when allocating the page.
 	 *
-	 * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
+	 * MEMORY_TYPE_GENERIC pages automatically have
 	 * their refcount reset to one whenever they are freed (ie. after
 	 * their refcount drops to 0).
 	 */
 	switch (pgmap->type) {
+	case MEMORY_DEVICE_FS_DAX:
 	case MEMORY_DEVICE_PRIVATE:
 	case MEMORY_DEVICE_COHERENT:
 	case MEMORY_DEVICE_PCI_P2PDMA:
 		set_page_count(page, 0);
 		break;
 
-	case MEMORY_DEVICE_FS_DAX:
 	case MEMORY_DEVICE_GENERIC:
 		break;
 	}
diff --git a/mm/swap.c b/mm/swap.c
index 6b83898..0b90b61 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -969,8 +969,6 @@  void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
 				unlock_page_lruvec_irqrestore(lruvec, flags);
 				lruvec = NULL;
 			}
-			if (put_devmap_managed_folio_refs(folio, nr_refs))
-				continue;
 			if (folio_ref_sub_and_test(folio, nr_refs))
 				free_zone_device_folio(folio);
 			continue;