
[v6,RESEND,1/2] dma: replace zone_dma_bits by zone_dma_limit

Message ID 17c067618b93e5d71f19c37826d54db4299621a3.1723359916.git.baruch@tkos.co.il (mailing list archive)
State Handled Elsewhere
Series dma: support DMA zone starting above 4GB

Commit Message

Baruch Siach Aug. 11, 2024, 7:09 a.m. UTC
From: Catalin Marinas <catalin.marinas@arm.com>

The hardware DMA limit might not be a power of 2. When the RAM range starts
above 0, say at 4GB, a 30-bit DMA limit should end at 5GB. A single high bit
cannot encode this limit.

Use a plain address for the DMA zone limit instead.

Since the DMA zone can now potentially span beyond the 4GB physical limit of
DMA32, make sure to use the DMA zone for GFP_DMA32 allocations in that case.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Co-developed-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
---
 arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
 arch/powerpc/mm/mem.c      |  5 ++++-
 arch/s390/mm/init.c        |  2 +-
 include/linux/dma-direct.h |  2 +-
 kernel/dma/direct.c        |  6 +++---
 kernel/dma/pool.c          |  4 ++--
 kernel/dma/swiotlb.c       |  6 +++---
 7 files changed, 29 insertions(+), 26 deletions(-)
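
As a quick illustration of the rationale above, here is a minimal, self-contained
sketch (not kernel code; the RAM base and the 30-bit device limit are the
hypothetical numbers from the commit message) of why a bit count like
zone_dma_bits cannot describe the DMA boundary once RAM starts above address 0,
while a plain limit address can:

#include <stdint.h>
#include <stdio.h>

/* Same idea as the kernel's DMA_BIT_MASK(n): the highest address reachable
 * with an n-bit mask, always a power of two minus one. */
#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

int main(void)
{
	/* Hypothetical platform: DRAM starts at 4GB, the device can drive
	 * 30 bits of offset on the bus, so CPU addresses up to 5GB work. */
	uint64_t ram_base  = 4ULL << 30;                  /* 0x1_0000_0000 */
	uint64_t dma_limit = ram_base + (1ULL << 30) - 1; /* 0x1_3fff_ffff */

	/* A bit count can only express power-of-two-minus-one limits... */
	for (unsigned int bits = 30; bits <= 34; bits++)
		printf("DMA_BIT_MASK(%u) = %#llx\n", bits,
		       (unsigned long long)DMA_BIT_MASK(bits));

	/* ...none of which equals the real limit: the zone either ends below
	 * the start of RAM or covers memory the device cannot reach.  A plain
	 * address stores the limit exactly. */
	printf("actual device limit = %#llx\n", (unsigned long long)dma_limit);
	return 0;
}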

Comments

Petr Tesarik Aug. 12, 2024, 5:52 a.m. UTC | #1
On Sun, 11 Aug 2024 10:09:35 +0300
Baruch Siach <baruch@tkos.co.il> wrote:

> From: Catalin Marinas <catalin.marinas@arm.com>
> 
> Hardware DMA limit might not be power of 2. When RAM range starts above
> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> can not encode this limit.
> 
> Use plain address for DMA zone limit.
> 
> Since DMA zone can now potentially span beyond 4GB physical limit of
> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
> 
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>

LGTM.

Reviewed-by: Petr Tesarik <ptesarik@suse.com>

Petr T

> ---
>  arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>  arch/powerpc/mm/mem.c      |  5 ++++-
>  arch/s390/mm/init.c        |  2 +-
>  include/linux/dma-direct.h |  2 +-
>  kernel/dma/direct.c        |  6 +++---
>  kernel/dma/pool.c          |  4 ++--
>  kernel/dma/swiotlb.c       |  6 +++---
>  7 files changed, 29 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 9b5ab6818f7f..c45e2152ca9e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
>  }
>  
>  /*
> - * Return the maximum physical address for a zone accessible by the given bits
> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
> + * Return the maximum physical address for a zone given its limit.
> + * If DRAM starts above 32-bit, expand the zone to the maximum
>   * available memory, otherwise cap it at 32-bit.
>   */
> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
>  {
> -	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
>  	phys_addr_t phys_start = memblock_start_of_DRAM();
>  
>  	if (phys_start > U32_MAX)
> -		zone_mask = PHYS_ADDR_MAX;
> -	else if (phys_start > zone_mask)
> -		zone_mask = U32_MAX;
> +		zone_limit = PHYS_ADDR_MAX;
> +	else if (phys_start > zone_limit)
> +		zone_limit = U32_MAX;
>  
> -	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
> +	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
>  }
>  
>  static void __init zone_sizes_init(void)
>  {
>  	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
> -	unsigned int __maybe_unused acpi_zone_dma_bits;
> -	unsigned int __maybe_unused dt_zone_dma_bits;
> -	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
> +	phys_addr_t __maybe_unused acpi_zone_dma_limit;
> +	phys_addr_t __maybe_unused dt_zone_dma_limit;
> +	phys_addr_t __maybe_unused dma32_phys_limit =
> +		max_zone_phys(DMA_BIT_MASK(32));
>  
>  #ifdef CONFIG_ZONE_DMA
> -	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
> -	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
> -	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
> -	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
> +	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
> +	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
> +	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
> +	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
>  	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
>  #endif
>  #ifdef CONFIG_ZONE_DMA32
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index d325217ab201..05b7f702b3f7 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void)
>   * everything else. GFP_DMA32 page allocations automatically fall back to
>   * ZONE_DMA.
>   *
> - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
> + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
>   * generic DMA mapping code.  32-bit only devices (if not handled by an IOMMU
>   * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
>   * ZONE_DMA.
> @@ -230,6 +230,7 @@ void __init paging_init(void)
>  {
>  	unsigned long long total_ram = memblock_phys_mem_size();
>  	phys_addr_t top_of_ram = memblock_end_of_DRAM();
> +	int zone_dma_bits;
>  
>  #ifdef CONFIG_HIGHMEM
>  	unsigned long v = __fix_to_virt(FIX_KMAP_END);
> @@ -256,6 +257,8 @@ void __init paging_init(void)
>  	else
>  		zone_dma_bits = 31;
>  
> +	zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
> +
>  #ifdef CONFIG_ZONE_DMA
>  	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn,
>  				      1UL << (zone_dma_bits - PAGE_SHIFT));
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index ddcd39ef4346..91fc2b91adfc 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -97,7 +97,7 @@ void __init paging_init(void)
>  
>  	vmem_map_init();
>  	sparse_init();
> -	zone_dma_bits = 31;
> +	zone_dma_limit = DMA_BIT_MASK(31);
>  	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
>  	max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
>  	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index edbe13d00776..d7e30d4f7503 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -12,7 +12,7 @@
>  #include <linux/mem_encrypt.h>
>  #include <linux/swiotlb.h>
>  
> -extern unsigned int zone_dma_bits;
> +extern u64 zone_dma_limit;
>  
>  /*
>   * Record the mapping of CPU physical to DMA addresses for a given region.
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 4480a3cd92e0..f2ba074a6a54 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -20,7 +20,7 @@
>   * it for entirely different regions. In that case the arch code needs to
>   * override the variable below for dma-direct to work properly.
>   */
> -unsigned int zone_dma_bits __ro_after_init = 24;
> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
>  
>  static inline dma_addr_t phys_to_dma_direct(struct device *dev,
>  		phys_addr_t phys)
> @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
>  	 * zones.
>  	 */
>  	*phys_limit = dma_to_phys(dev, dma_limit);
> -	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> +	if (*phys_limit <= zone_dma_limit)
>  		return GFP_DMA;
>  	if (*phys_limit <= DMA_BIT_MASK(32))
>  		return GFP_DMA32;
> @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
>  	 * part of the check.
>  	 */
>  	if (IS_ENABLED(CONFIG_ZONE_DMA))
> -		min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
> +		min_mask = min_t(u64, min_mask, zone_dma_limit);
>  	return mask >= phys_to_dma_unencrypted(dev, min_mask);
>  }
>  
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index d10613eb0f63..7b04f7575796 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
>  	/* CMA can't cross zone boundaries, see cma_activate_area() */
>  	end = cma_get_base(cma) + size - 1;
>  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> -		return end <= DMA_BIT_MASK(zone_dma_bits);
> +		return end <= zone_dma_limit;
>  	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> -		return end <= DMA_BIT_MASK(32);
> +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
>  	return true;
>  }
>  
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index df68d29740a0..abcf3fa63a56 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>  	if (!remap)
>  		io_tlb_default_mem.can_grow = true;
>  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
> +		io_tlb_default_mem.phys_limit = zone_dma_limit;
>  	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
> +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
>  	else
>  		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
>  #endif
> @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
>  	}
>  
>  	gfp &= ~GFP_ZONEMASK;
> -	if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> +	if (phys_limit <= zone_dma_limit)
>  		gfp |= __GFP_DMA;
>  	else if (phys_limit <= DMA_BIT_MASK(32))
>  		gfp |= __GFP_DMA32;
Catalin Marinas Aug. 12, 2024, 11:22 a.m. UTC | #2
On Sun, Aug 11, 2024 at 10:09:35AM +0300, Baruch Siach wrote:
> From: Catalin Marinas <catalin.marinas@arm.com>
> 
> Hardware DMA limit might not be power of 2. When RAM range starts above
> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> can not encode this limit.
> 
> Use plain address for DMA zone limit.
> 
> Since DMA zone can now potentially span beyond 4GB physical limit of
> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
> 
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>

You might want to say that no functional change is expected with this
patch. The patch looks fine.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Will Deacon Aug. 16, 2024, 11:52 a.m. UTC | #3
On Sun, Aug 11, 2024 at 10:09:35AM +0300, Baruch Siach wrote:
> From: Catalin Marinas <catalin.marinas@arm.com>
> 
> Hardware DMA limit might not be power of 2. When RAM range starts above
> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> can not encode this limit.
> 
> Use plain address for DMA zone limit.
> 
> Since DMA zone can now potentially span beyond 4GB physical limit of
> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
> 
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
> ---
>  arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>  arch/powerpc/mm/mem.c      |  5 ++++-
>  arch/s390/mm/init.c        |  2 +-
>  include/linux/dma-direct.h |  2 +-
>  kernel/dma/direct.c        |  6 +++---
>  kernel/dma/pool.c          |  4 ++--
>  kernel/dma/swiotlb.c       |  6 +++---
>  7 files changed, 29 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 9b5ab6818f7f..c45e2152ca9e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
>  }
>  
>  /*
> - * Return the maximum physical address for a zone accessible by the given bits
> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
> + * Return the maximum physical address for a zone given its limit.
> + * If DRAM starts above 32-bit, expand the zone to the maximum
>   * available memory, otherwise cap it at 32-bit.
>   */
> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
>  {
> -	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
>  	phys_addr_t phys_start = memblock_start_of_DRAM();
>  
>  	if (phys_start > U32_MAX)
> -		zone_mask = PHYS_ADDR_MAX;
> -	else if (phys_start > zone_mask)
> -		zone_mask = U32_MAX;
> +		zone_limit = PHYS_ADDR_MAX;
> +	else if (phys_start > zone_limit)
> +		zone_limit = U32_MAX;
>  
> -	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
> +	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;

Why do we need to adjust +-1 now that we're no longer using a mask?

>  }
>  
>  static void __init zone_sizes_init(void)
>  {
>  	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
> -	unsigned int __maybe_unused acpi_zone_dma_bits;
> -	unsigned int __maybe_unused dt_zone_dma_bits;
> -	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
> +	phys_addr_t __maybe_unused acpi_zone_dma_limit;
> +	phys_addr_t __maybe_unused dt_zone_dma_limit;
> +	phys_addr_t __maybe_unused dma32_phys_limit =
> +		max_zone_phys(DMA_BIT_MASK(32));
>  
>  #ifdef CONFIG_ZONE_DMA
> -	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
> -	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
> -	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
> -	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
> +	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
> +	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
> +	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
> +	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
>  	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
>  #endif

Maybe move this block into a helper function so we can avoid three
__maybe_unused variables?

Will
Petr Tesařík Aug. 16, 2024, 2:37 p.m. UTC | #4
On Fri, 16 Aug 2024 12:52:47 +0100
Will Deacon <will@kernel.org> wrote:

> On Sun, Aug 11, 2024 at 10:09:35AM +0300, Baruch Siach wrote:
> > From: Catalin Marinas <catalin.marinas@arm.com>
> > 
> > Hardware DMA limit might not be power of 2. When RAM range starts above
> > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> > can not encode this limit.
> > 
> > Use plain address for DMA zone limit.
> > 
> > Since DMA zone can now potentially span beyond 4GB physical limit of
> > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
> > 
> > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> > Co-developed-by: Baruch Siach <baruch@tkos.co.il>
> > Signed-off-by: Baruch Siach <baruch@tkos.co.il>
> > ---
> >  arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
> >  arch/powerpc/mm/mem.c      |  5 ++++-
> >  arch/s390/mm/init.c        |  2 +-
> >  include/linux/dma-direct.h |  2 +-
> >  kernel/dma/direct.c        |  6 +++---
> >  kernel/dma/pool.c          |  4 ++--
> >  kernel/dma/swiotlb.c       |  6 +++---
> >  7 files changed, 29 insertions(+), 26 deletions(-)
> > 
> > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> > index 9b5ab6818f7f..c45e2152ca9e 100644
> > --- a/arch/arm64/mm/init.c
> > +++ b/arch/arm64/mm/init.c
> > @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
> >  }
> >  
> >  /*
> > - * Return the maximum physical address for a zone accessible by the given bits
> > - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
> > + * Return the maximum physical address for a zone given its limit.
> > + * If DRAM starts above 32-bit, expand the zone to the maximum
> >   * available memory, otherwise cap it at 32-bit.
> >   */
> > -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> > +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
> >  {
> > -	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
> >  	phys_addr_t phys_start = memblock_start_of_DRAM();
> >  
> >  	if (phys_start > U32_MAX)
> > -		zone_mask = PHYS_ADDR_MAX;
> > -	else if (phys_start > zone_mask)
> > -		zone_mask = U32_MAX;
> > +		zone_limit = PHYS_ADDR_MAX;
> > +	else if (phys_start > zone_limit)
> > +		zone_limit = U32_MAX;
> >  
> > -	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
> > +	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;  
> 
> Why do we need to adjust +-1 now that we're no longer using a mask?

Subtracting 1 is needed to get the highest valid DRAM address so it can
be compared to the highest address in the zone (zone_limit). Adding 1
is necessary to get the lowest address beyond the zone.

AFAICT this is the right thing here:

	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);

max_zone_pfns[] max values are exclusive, i.e. the lowest PFN which is
_not_ within the zone.

It is also the right thing when arm64_dma_phys_limit is passed to
dma_contiguous_reserve().

It would be subtly broken if phys_addr_t could be a 32-bit integer, but
that's not possible on arm64.
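
A minimal arithmetic sketch of the point Petr makes here, using made-up values
(4KB pages, a zone limited to the first 4GB, DRAM ending exactly at 4GB), not
taken from any particular board:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12			/* assume 4KB pages */
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	uint64_t zone_limit  = 0xffffffffULL;	/* highest address the zone may cover */
	uint64_t end_of_dram = 0x100000000ULL;	/* exclusive: first byte past DRAM */

	/* "- 1" turns the exclusive DRAM end into the highest valid DRAM
	 * address so it can be compared with zone_limit; "+ 1" turns the
	 * result back into the first address beyond the zone. */
	uint64_t highest  = zone_limit < end_of_dram - 1 ? zone_limit : end_of_dram - 1;
	uint64_t zone_end = highest + 1;

	/* max_zone_pfns[] entries are exclusive, so PFN_DOWN(zone_end) is the
	 * lowest PFN that is NOT part of the zone. */
	printf("zone ends before %#llx, max_zone_pfn = %#llx\n",
	       (unsigned long long)zone_end,
	       (unsigned long long)PFN_DOWN(zone_end));
	return 0;
}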

In short, LGTM.

Petr T
Marek Szyprowski Aug. 26, 2024, 7:28 p.m. UTC | #5
Dear All,

On 11.08.2024 09:09, Baruch Siach wrote:
> From: Catalin Marinas <catalin.marinas@arm.com>
>
> Hardware DMA limit might not be power of 2. When RAM range starts above
> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> can not encode this limit.
>
> Use plain address for DMA zone limit.
>
> Since DMA zone can now potentially span beyond 4GB physical limit of
> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
>
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
> ---

This patch landed recently in linux-next as commit ba0fb44aed47 
("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my 
tests I found that it introduces the following warning on the ARM64/Rockchip-
based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts):

------------[ cut here ]------------
dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 
overflow (mask ffffffff, bus limit 0).
WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308
Modules linked in:
CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278
Hardware name: Hardkernel ODROID-M1 (DT)
pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : swiotlb_map+0x2f0/0x308
lr : swiotlb_map+0x2f0/0x308
...
Call trace:
  swiotlb_map+0x2f0/0x308
  dma_direct_map_sg+0x9c/0x2e4
  __dma_map_sg_attrs+0x28/0x94
  dma_map_sg_attrs+0x10/0x24
  dw_mci_pre_dma_transfer+0xb8/0xf4
  dw_mci_pre_req+0x50/0x68
  mmc_blk_mq_issue_rq+0x3e0/0x964
  mmc_mq_queue_rq+0x118/0x2b4
  blk_mq_dispatch_rq_list+0x21c/0x714
  __blk_mq_sched_dispatch_requests+0x490/0x58c
  blk_mq_sched_dispatch_requests+0x30/0x6c
  blk_mq_run_hw_queue+0x284/0x40c
  blk_mq_flush_plug_list.part.0+0x190/0x974
  blk_mq_flush_plug_list+0x1c/0x2c
  __blk_flush_plug+0xe4/0x140
  blk_finish_plug+0x38/0x4c
  __ext4_get_inode_loc+0x22c/0x654
  __ext4_get_inode_loc_noinmem+0x40/0xa8
  __ext4_iget+0x154/0xcc0
  ext4_get_journal_inode+0x30/0x110
  ext4_load_and_init_journal+0x9c/0xaf0
  ext4_fill_super+0x1fec/0x2d90
  get_tree_bdev+0x140/0x1d8
  ext4_get_tree+0x18/0x24
  vfs_get_tree+0x28/0xe8
  path_mount+0x3e8/0xb7c
  init_mount+0x68/0xac
  do_mount_root+0x108/0x1dc
  mount_root_generic+0x100/0x330
  mount_root+0x160/0x2d0
  initrd_load+0x1f0/0x2a0
  prepare_namespace+0x4c/0x29c
  kernel_init_freeable+0x4b4/0x50c
  kernel_init+0x20/0x1d8
  ret_from_fork+0x10/0x20
irq event stamp: 1305682
hardirqs last  enabled at (1305681): [<ffff8000800e332c>] 
console_unlock+0x124/0x130
hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c
softirqs last  enabled at (1305678): [<ffff80008005be1c>] 
handle_softirqs+0x4cc/0x4e4
softirqs last disabled at (1305665): [<ffff8000800105b0>] 
__do_softirq+0x14/0x20
---[ end trace 0000000000000000 ]---

This "bus limit 0" seems to be a bit suspicious to me as well as the 
fact that swiotlb is used for the MMC DMA. I will investigate this 
further tomorrow. The board boots fine though.


>   arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>   arch/powerpc/mm/mem.c      |  5 ++++-
>   arch/s390/mm/init.c        |  2 +-
>   include/linux/dma-direct.h |  2 +-
>   kernel/dma/direct.c        |  6 +++---
>   kernel/dma/pool.c          |  4 ++--
>   kernel/dma/swiotlb.c       |  6 +++---
>   7 files changed, 29 insertions(+), 26 deletions(-)
>
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 9b5ab6818f7f..c45e2152ca9e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
>   }
>   
>   /*
> - * Return the maximum physical address for a zone accessible by the given bits
> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
> + * Return the maximum physical address for a zone given its limit.
> + * If DRAM starts above 32-bit, expand the zone to the maximum
>    * available memory, otherwise cap it at 32-bit.
>    */
> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
>   {
> -	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
>   	phys_addr_t phys_start = memblock_start_of_DRAM();
>   
>   	if (phys_start > U32_MAX)
> -		zone_mask = PHYS_ADDR_MAX;
> -	else if (phys_start > zone_mask)
> -		zone_mask = U32_MAX;
> +		zone_limit = PHYS_ADDR_MAX;
> +	else if (phys_start > zone_limit)
> +		zone_limit = U32_MAX;
>   
> -	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
> +	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
>   }
>   
>   static void __init zone_sizes_init(void)
>   {
>   	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
> -	unsigned int __maybe_unused acpi_zone_dma_bits;
> -	unsigned int __maybe_unused dt_zone_dma_bits;
> -	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
> +	phys_addr_t __maybe_unused acpi_zone_dma_limit;
> +	phys_addr_t __maybe_unused dt_zone_dma_limit;
> +	phys_addr_t __maybe_unused dma32_phys_limit =
> +		max_zone_phys(DMA_BIT_MASK(32));
>   
>   #ifdef CONFIG_ZONE_DMA
> -	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
> -	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
> -	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
> -	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
> +	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
> +	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
> +	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
> +	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
>   	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
>   #endif
>   #ifdef CONFIG_ZONE_DMA32
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index d325217ab201..05b7f702b3f7 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void)
>    * everything else. GFP_DMA32 page allocations automatically fall back to
>    * ZONE_DMA.
>    *
> - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
> + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
>    * generic DMA mapping code.  32-bit only devices (if not handled by an IOMMU
>    * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
>    * ZONE_DMA.
> @@ -230,6 +230,7 @@ void __init paging_init(void)
>   {
>   	unsigned long long total_ram = memblock_phys_mem_size();
>   	phys_addr_t top_of_ram = memblock_end_of_DRAM();
> +	int zone_dma_bits;
>   
>   #ifdef CONFIG_HIGHMEM
>   	unsigned long v = __fix_to_virt(FIX_KMAP_END);
> @@ -256,6 +257,8 @@ void __init paging_init(void)
>   	else
>   		zone_dma_bits = 31;
>   
> +	zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
> +
>   #ifdef CONFIG_ZONE_DMA
>   	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn,
>   				      1UL << (zone_dma_bits - PAGE_SHIFT));
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index ddcd39ef4346..91fc2b91adfc 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -97,7 +97,7 @@ void __init paging_init(void)
>   
>   	vmem_map_init();
>   	sparse_init();
> -	zone_dma_bits = 31;
> +	zone_dma_limit = DMA_BIT_MASK(31);
>   	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
>   	max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
>   	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index edbe13d00776..d7e30d4f7503 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -12,7 +12,7 @@
>   #include <linux/mem_encrypt.h>
>   #include <linux/swiotlb.h>
>   
> -extern unsigned int zone_dma_bits;
> +extern u64 zone_dma_limit;
>   
>   /*
>    * Record the mapping of CPU physical to DMA addresses for a given region.
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 4480a3cd92e0..f2ba074a6a54 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -20,7 +20,7 @@
>    * it for entirely different regions. In that case the arch code needs to
>    * override the variable below for dma-direct to work properly.
>    */
> -unsigned int zone_dma_bits __ro_after_init = 24;
> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
>   
>   static inline dma_addr_t phys_to_dma_direct(struct device *dev,
>   		phys_addr_t phys)
> @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
>   	 * zones.
>   	 */
>   	*phys_limit = dma_to_phys(dev, dma_limit);
> -	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> +	if (*phys_limit <= zone_dma_limit)
>   		return GFP_DMA;
>   	if (*phys_limit <= DMA_BIT_MASK(32))
>   		return GFP_DMA32;
> @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
>   	 * part of the check.
>   	 */
>   	if (IS_ENABLED(CONFIG_ZONE_DMA))
> -		min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
> +		min_mask = min_t(u64, min_mask, zone_dma_limit);
>   	return mask >= phys_to_dma_unencrypted(dev, min_mask);
>   }
>   
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index d10613eb0f63..7b04f7575796 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
>   	/* CMA can't cross zone boundaries, see cma_activate_area() */
>   	end = cma_get_base(cma) + size - 1;
>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> -		return end <= DMA_BIT_MASK(zone_dma_bits);
> +		return end <= zone_dma_limit;
>   	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> -		return end <= DMA_BIT_MASK(32);
> +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
>   	return true;
>   }
>   
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index df68d29740a0..abcf3fa63a56 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>   	if (!remap)
>   		io_tlb_default_mem.can_grow = true;
>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
> +		io_tlb_default_mem.phys_limit = zone_dma_limit;
>   	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
> +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
>   	else
>   		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
>   #endif
> @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
>   	}
>   
>   	gfp &= ~GFP_ZONEMASK;
> -	if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> +	if (phys_limit <= zone_dma_limit)
>   		gfp |= __GFP_DMA;
>   	else if (phys_limit <= DMA_BIT_MASK(32))
>   		gfp |= __GFP_DMA32;

Best regards
Baruch Siach Aug. 27, 2024, 4:52 a.m. UTC | #6
Hi Marek,

Thanks for your report.

On Mon, Aug 26 2024, Marek Szyprowski wrote:
> On 11.08.2024 09:09, Baruch Siach wrote:
>> From: Catalin Marinas <catalin.marinas@arm.com>
>>
>> Hardware DMA limit might not be power of 2. When RAM range starts above
>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
>> can not encode this limit.
>>
>> Use plain address for DMA zone limit.
>>
>> Since DMA zone can now potentially span beyond 4GB physical limit of
>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
>>
>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
>> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
>> ---
>
> This patch landed recently in linux-next as commit ba0fb44aed47 
> ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my 
> tests I found that it introduces the following warning on ARM64/Rockchip 
> based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts):

Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47?

Upstream rockchip DTs have no dma-ranges property. Is that the case for
your platform as well?

Can you share kernel report of DMA zones and swiotlb? On my platform I get:

[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000800000000-0x000000083fffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   [mem 0x0000000840000000-0x0000000fffffffff]
...
[    0.000000] software IO TLB: area num 8.
[    0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB)

What do you get at your end?

> ------------[ cut here ]------------
> dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 
> overflow (mask ffffffff, bus limit 0).
> WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308
> Modules linked in:
> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278
> Hardware name: Hardkernel ODROID-M1 (DT)
> pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> pc : swiotlb_map+0x2f0/0x308
> lr : swiotlb_map+0x2f0/0x308
> ...
> Call trace:
>   swiotlb_map+0x2f0/0x308
>   dma_direct_map_sg+0x9c/0x2e4
>   __dma_map_sg_attrs+0x28/0x94
>   dma_map_sg_attrs+0x10/0x24
>   dw_mci_pre_dma_transfer+0xb8/0xf4
>   dw_mci_pre_req+0x50/0x68
>   mmc_blk_mq_issue_rq+0x3e0/0x964
>   mmc_mq_queue_rq+0x118/0x2b4
>   blk_mq_dispatch_rq_list+0x21c/0x714
>   __blk_mq_sched_dispatch_requests+0x490/0x58c
>   blk_mq_sched_dispatch_requests+0x30/0x6c
>   blk_mq_run_hw_queue+0x284/0x40c
>   blk_mq_flush_plug_list.part.0+0x190/0x974
>   blk_mq_flush_plug_list+0x1c/0x2c
>   __blk_flush_plug+0xe4/0x140
>   blk_finish_plug+0x38/0x4c
>   __ext4_get_inode_loc+0x22c/0x654
>   __ext4_get_inode_loc_noinmem+0x40/0xa8
>   __ext4_iget+0x154/0xcc0
>   ext4_get_journal_inode+0x30/0x110
>   ext4_load_and_init_journal+0x9c/0xaf0
>   ext4_fill_super+0x1fec/0x2d90
>   get_tree_bdev+0x140/0x1d8
>   ext4_get_tree+0x18/0x24
>   vfs_get_tree+0x28/0xe8
>   path_mount+0x3e8/0xb7c
>   init_mount+0x68/0xac
>   do_mount_root+0x108/0x1dc
>   mount_root_generic+0x100/0x330
>   mount_root+0x160/0x2d0
>   initrd_load+0x1f0/0x2a0
>   prepare_namespace+0x4c/0x29c
>   kernel_init_freeable+0x4b4/0x50c
>   kernel_init+0x20/0x1d8
>   ret_from_fork+0x10/0x20
> irq event stamp: 1305682
> hardirqs last  enabled at (1305681): [<ffff8000800e332c>] 
> console_unlock+0x124/0x130
> hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c
> softirqs last  enabled at (1305678): [<ffff80008005be1c>] 
> handle_softirqs+0x4cc/0x4e4
> softirqs last disabled at (1305665): [<ffff8000800105b0>] 
> __do_softirq+0x14/0x20
> ---[ end trace 0000000000000000 ]---
>
> This "bus limit 0" seems to be a bit suspicious to me as well as the 
> fact that swiotlb is used for the MMC DMA. I will investigate this 
> further tomorrow. The board boots fine though.

Looking at the code, I guess that bus_dma_limit set to 0 means there is no
bus limit. But the dma_mask for your device indicates a 32-bit device limit,
which can't work with addresses above 4GB. For some reason the DMA code tries
to allocate from a higher address, and that is most likely why dma_capable()
returns false.
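
For reference, a simplified, self-contained approximation of the capability
check being discussed (this is a sketch, not the kernel's exact dma_capable()
implementation), plugged with the numbers from the warning above. It shows why
a 32-bit dma_mask fails once the bounce buffer sits above 4GB, while a
bus_dma_limit of 0 is simply treated as "no additional bus restriction":

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Roughly the kernel's min_not_zero(): a limit of 0 means "no limit",
 * so it must not win the comparison. */
static uint64_t min_not_zero(uint64_t a, uint64_t b)
{
	if (a == 0)
		return b;
	if (b == 0)
		return a;
	return a < b ? a : b;
}

/* Simplified stand-in for dma_capable(): the whole buffer must end below
 * both the device's dma_mask and the bus limit. */
static bool capable(uint64_t dma_mask, uint64_t bus_dma_limit,
		    uint64_t addr, uint64_t size)
{
	uint64_t end = addr + size - 1;

	return end <= min_not_zero(dma_mask, bus_dma_limit);
}

int main(void)
{
	/* Values from the reported warning. */
	uint64_t addr      = 0x00000001faf00000ULL;	/* swiotlb slot above 4GB */
	uint64_t size      = 4096;
	uint64_t mask      = 0xffffffffULL;		/* 32-bit-only device */
	uint64_t bus_limit = 0;				/* no bus restriction */

	printf("capable = %d\n", capable(mask, bus_limit, addr, size)); /* prints 0 */
	return 0;
}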

Thanks,
baruch

>>   arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>>   arch/powerpc/mm/mem.c      |  5 ++++-
>>   arch/s390/mm/init.c        |  2 +-
>>   include/linux/dma-direct.h |  2 +-
>>   kernel/dma/direct.c        |  6 +++---
>>   kernel/dma/pool.c          |  4 ++--
>>   kernel/dma/swiotlb.c       |  6 +++---
>>   7 files changed, 29 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 9b5ab6818f7f..c45e2152ca9e 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
>>   }
>>   
>>   /*
>> - * Return the maximum physical address for a zone accessible by the given bits
>> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
>> + * Return the maximum physical address for a zone given its limit.
>> + * If DRAM starts above 32-bit, expand the zone to the maximum
>>    * available memory, otherwise cap it at 32-bit.
>>    */
>> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
>> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
>>   {
>> -	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
>>   	phys_addr_t phys_start = memblock_start_of_DRAM();
>>   
>>   	if (phys_start > U32_MAX)
>> -		zone_mask = PHYS_ADDR_MAX;
>> -	else if (phys_start > zone_mask)
>> -		zone_mask = U32_MAX;
>> +		zone_limit = PHYS_ADDR_MAX;
>> +	else if (phys_start > zone_limit)
>> +		zone_limit = U32_MAX;
>>   
>> -	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
>> +	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
>>   }
>>   
>>   static void __init zone_sizes_init(void)
>>   {
>>   	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
>> -	unsigned int __maybe_unused acpi_zone_dma_bits;
>> -	unsigned int __maybe_unused dt_zone_dma_bits;
>> -	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
>> +	phys_addr_t __maybe_unused acpi_zone_dma_limit;
>> +	phys_addr_t __maybe_unused dt_zone_dma_limit;
>> +	phys_addr_t __maybe_unused dma32_phys_limit =
>> +		max_zone_phys(DMA_BIT_MASK(32));
>>   
>>   #ifdef CONFIG_ZONE_DMA
>> -	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
>> -	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
>> -	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
>> -	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
>> +	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
>> +	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
>> +	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
>> +	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
>>   	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
>>   #endif
>>   #ifdef CONFIG_ZONE_DMA32
>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index d325217ab201..05b7f702b3f7 100644
>> --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void)
>>    * everything else. GFP_DMA32 page allocations automatically fall back to
>>    * ZONE_DMA.
>>    *
>> - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
>> + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
>>    * generic DMA mapping code.  32-bit only devices (if not handled by an IOMMU
>>    * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
>>    * ZONE_DMA.
>> @@ -230,6 +230,7 @@ void __init paging_init(void)
>>   {
>>   	unsigned long long total_ram = memblock_phys_mem_size();
>>   	phys_addr_t top_of_ram = memblock_end_of_DRAM();
>> +	int zone_dma_bits;
>>   
>>   #ifdef CONFIG_HIGHMEM
>>   	unsigned long v = __fix_to_virt(FIX_KMAP_END);
>> @@ -256,6 +257,8 @@ void __init paging_init(void)
>>   	else
>>   		zone_dma_bits = 31;
>>   
>> +	zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
>> +
>>   #ifdef CONFIG_ZONE_DMA
>>   	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn,
>>   				      1UL << (zone_dma_bits - PAGE_SHIFT));
>> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
>> index ddcd39ef4346..91fc2b91adfc 100644
>> --- a/arch/s390/mm/init.c
>> +++ b/arch/s390/mm/init.c
>> @@ -97,7 +97,7 @@ void __init paging_init(void)
>>   
>>   	vmem_map_init();
>>   	sparse_init();
>> -	zone_dma_bits = 31;
>> +	zone_dma_limit = DMA_BIT_MASK(31);
>>   	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
>>   	max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
>>   	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
>> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
>> index edbe13d00776..d7e30d4f7503 100644
>> --- a/include/linux/dma-direct.h
>> +++ b/include/linux/dma-direct.h
>> @@ -12,7 +12,7 @@
>>   #include <linux/mem_encrypt.h>
>>   #include <linux/swiotlb.h>
>>   
>> -extern unsigned int zone_dma_bits;
>> +extern u64 zone_dma_limit;
>>   
>>   /*
>>    * Record the mapping of CPU physical to DMA addresses for a given region.
>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>> index 4480a3cd92e0..f2ba074a6a54 100644
>> --- a/kernel/dma/direct.c
>> +++ b/kernel/dma/direct.c
>> @@ -20,7 +20,7 @@
>>    * it for entirely different regions. In that case the arch code needs to
>>    * override the variable below for dma-direct to work properly.
>>    */
>> -unsigned int zone_dma_bits __ro_after_init = 24;
>> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
>>   
>>   static inline dma_addr_t phys_to_dma_direct(struct device *dev,
>>   		phys_addr_t phys)
>> @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
>>   	 * zones.
>>   	 */
>>   	*phys_limit = dma_to_phys(dev, dma_limit);
>> -	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
>> +	if (*phys_limit <= zone_dma_limit)
>>   		return GFP_DMA;
>>   	if (*phys_limit <= DMA_BIT_MASK(32))
>>   		return GFP_DMA32;
>> @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
>>   	 * part of the check.
>>   	 */
>>   	if (IS_ENABLED(CONFIG_ZONE_DMA))
>> -		min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
>> +		min_mask = min_t(u64, min_mask, zone_dma_limit);
>>   	return mask >= phys_to_dma_unencrypted(dev, min_mask);
>>   }
>>   
>> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
>> index d10613eb0f63..7b04f7575796 100644
>> --- a/kernel/dma/pool.c
>> +++ b/kernel/dma/pool.c
>> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
>>   	/* CMA can't cross zone boundaries, see cma_activate_area() */
>>   	end = cma_get_base(cma) + size - 1;
>>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
>> -		return end <= DMA_BIT_MASK(zone_dma_bits);
>> +		return end <= zone_dma_limit;
>>   	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
>> -		return end <= DMA_BIT_MASK(32);
>> +		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
>>   	return true;
>>   }
>>   
>> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
>> index df68d29740a0..abcf3fa63a56 100644
>> --- a/kernel/dma/swiotlb.c
>> +++ b/kernel/dma/swiotlb.c
>> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>>   	if (!remap)
>>   		io_tlb_default_mem.can_grow = true;
>>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
>> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
>> +		io_tlb_default_mem.phys_limit = zone_dma_limit;
>>   	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
>> -		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
>> +		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
>>   	else
>>   		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
>>   #endif
>> @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
>>   	}
>>   
>>   	gfp &= ~GFP_ZONEMASK;
>> -	if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
>> +	if (phys_limit <= zone_dma_limit)
>>   		gfp |= __GFP_DMA;
>>   	else if (phys_limit <= DMA_BIT_MASK(32))
>>   		gfp |= __GFP_DMA32;
>
> Best regards
Marek Szyprowski Aug. 27, 2024, 6:14 a.m. UTC | #7
On 27.08.2024 06:52, Baruch Siach wrote:
> Hi Marek,
>
> Thanks for your report.
>
> On Mon, Aug 26 2024, Marek Szyprowski wrote:
>> On 11.08.2024 09:09, Baruch Siach wrote:
>>> From: Catalin Marinas <catalin.marinas@arm.com>
>>>
>>> Hardware DMA limit might not be power of 2. When RAM range starts above
>>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
>>> can not encode this limit.
>>>
>>> Use plain address for DMA zone limit.
>>>
>>> Since DMA zone can now potentially span beyond 4GB physical limit of
>>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
>>>
>>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>>> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
>>> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
>>> ---
>> This patch landed recently in linux-next as commit ba0fb44aed47
>> ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my
>> tests I found that it introduces the following warning on ARM64/Rockchip
>> based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts):
> Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47?

Yes, linux-next with above mentioned commits reverted works fine.


> Upstream rockchip DTs have no dma-ranges property. Is that the case for
> your platform as well?
>
> Can you share kernel report of DMA zones and swiotlb? On my platform I get:
>
> [    0.000000] Zone ranges:
> [    0.000000]   DMA      [mem 0x0000000800000000-0x000000083fffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   [mem 0x0000000840000000-0x0000000fffffffff]
> ...
> [    0.000000] software IO TLB: area num 8.
> [    0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB)
>
> What do you get at your end?

On ba0fb44aed47 I got:

[    0.000000] NUMA: No NUMA configuration found
[    0.000000] NUMA: Faking a node at [mem 
0x0000000000200000-0x00000001ffffffff]
[    0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff]
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000000200000-0x00000001ffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   empty
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000000200000-0x00000000083fffff]
[    0.000000]   node   0: [mem 0x0000000009400000-0x00000000efffffff]
[    0.000000]   node   0: [mem 0x00000001f0000000-0x00000001ffffffff]
[    0.000000] Initmem setup node 0 [mem 
0x0000000000200000-0x00000001ffffffff]
[    0.000000] On node 0, zone DMA: 512 pages in unavailable ranges
[    0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges
[    0.000000] cma: Reserved 96 MiB at 0x00000001f0000000 on node -1

...

[    0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 3MB
[    0.000000] software IO TLB: area num 4.
[    0.000000] software IO TLB: mapped [mem 
0x00000001fac00000-0x00000001fb000000] (4MB)

On the fa3c109a6d30 (parent commit of the $subject) I got:

[    0.000000] NUMA: No NUMA configuration found
[    0.000000] NUMA: Faking a node at [mem 
0x0000000000200000-0x00000001ffffffff]
[    0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff]
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000000200000-0x00000000ffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   [mem 0x0000000100000000-0x00000001ffffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000000200000-0x00000000083fffff]
[    0.000000]   node   0: [mem 0x0000000009400000-0x00000000efffffff]
[    0.000000]   node   0: [mem 0x00000001f0000000-0x00000001ffffffff]
[    0.000000] Initmem setup node 0 [mem 
0x0000000000200000-0x00000001ffffffff]
[    0.000000] On node 0, zone DMA: 512 pages in unavailable ranges
[    0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges
[    0.000000] cma: Reserved 96 MiB at 0x00000000ea000000 on node -1

...

[    0.000000] software IO TLB: area num 4.
[    0.000000] software IO TLB: mapped [mem 
0x00000000e6000000-0x00000000ea000000] (64MB)

It looks like for some reason the $subject patch changes the default zone
and swiotlb configuration.
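
A short sketch of how the patched arm64 code can produce this layout. It
assumes (this is an assumption worth confirming) that with no dma-ranges in the
DT and no IORT limit, both of_dma_get_max_cpu_address() and
acpi_iort_dma_get_max_cpu_address() report PHYS_ADDR_MAX, so nothing clamps
zone_dma_limit to 32 bits the way the old min3(32U, ...) did:

#include <stdint.h>
#include <stdio.h>

#define PHYS_ADDR_MAX	UINT64_MAX
#define U32_MAX		0xffffffffULL

/* Mirror of the patched arm64 max_zone_phys(), fed with the memblock
 * values from the boot log above (DRAM 0x200000 .. 0x1_ffff_ffff). */
static uint64_t max_zone_phys(uint64_t zone_limit,
			      uint64_t dram_start, uint64_t dram_end)
{
	if (dram_start > U32_MAX)
		zone_limit = PHYS_ADDR_MAX;
	else if (dram_start > zone_limit)
		zone_limit = U32_MAX;

	return (zone_limit < dram_end - 1 ? zone_limit : dram_end - 1) + 1;
}

int main(void)
{
	uint64_t dram_start = 0x200000ULL;
	uint64_t dram_end   = 0x200000000ULL;	/* exclusive end, 8GB */

	/* Old behaviour: zone_dma_bits was clamped to 32 bits, so ZONE_DMA
	 * ended at 4GB and the default swiotlb buffer stayed below it. */
	printf("old: ZONE_DMA ends at %#llx\n", (unsigned long long)
	       max_zone_phys(U32_MAX, dram_start, dram_end));

	/* New behaviour if both firmware lookups report "no limit": ZONE_DMA
	 * swallows all of RAM, so swiotlb may land above 4GB. */
	printf("new: ZONE_DMA ends at %#llx\n", (unsigned long long)
	       max_zone_phys(PHYS_ADDR_MAX, dram_start, dram_end));
	return 0;
}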

>> ------------[ cut here ]------------
>> dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096
>> overflow (mask ffffffff, bus limit 0).
>> WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308
>> Modules linked in:
>> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278
>> Hardware name: Hardkernel ODROID-M1 (DT)
>> pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>> pc : swiotlb_map+0x2f0/0x308
>> lr : swiotlb_map+0x2f0/0x308
>> ...
>> Call trace:
>>    swiotlb_map+0x2f0/0x308
>>    dma_direct_map_sg+0x9c/0x2e4
>>    __dma_map_sg_attrs+0x28/0x94
>>    dma_map_sg_attrs+0x10/0x24
>>    dw_mci_pre_dma_transfer+0xb8/0xf4
>>    dw_mci_pre_req+0x50/0x68
>>    mmc_blk_mq_issue_rq+0x3e0/0x964
>>    mmc_mq_queue_rq+0x118/0x2b4
>>    blk_mq_dispatch_rq_list+0x21c/0x714
>>    __blk_mq_sched_dispatch_requests+0x490/0x58c
>>    blk_mq_sched_dispatch_requests+0x30/0x6c
>>    blk_mq_run_hw_queue+0x284/0x40c
>>    blk_mq_flush_plug_list.part.0+0x190/0x974
>>    blk_mq_flush_plug_list+0x1c/0x2c
>>    __blk_flush_plug+0xe4/0x140
>>    blk_finish_plug+0x38/0x4c
>>    __ext4_get_inode_loc+0x22c/0x654
>>    __ext4_get_inode_loc_noinmem+0x40/0xa8
>>    __ext4_iget+0x154/0xcc0
>>    ext4_get_journal_inode+0x30/0x110
>>    ext4_load_and_init_journal+0x9c/0xaf0
>>    ext4_fill_super+0x1fec/0x2d90
>>    get_tree_bdev+0x140/0x1d8
>>    ext4_get_tree+0x18/0x24
>>    vfs_get_tree+0x28/0xe8
>>    path_mount+0x3e8/0xb7c
>>    init_mount+0x68/0xac
>>    do_mount_root+0x108/0x1dc
>>    mount_root_generic+0x100/0x330
>>    mount_root+0x160/0x2d0
>>    initrd_load+0x1f0/0x2a0
>>    prepare_namespace+0x4c/0x29c
>>    kernel_init_freeable+0x4b4/0x50c
>>    kernel_init+0x20/0x1d8
>>    ret_from_fork+0x10/0x20
>> irq event stamp: 1305682
>> hardirqs last  enabled at (1305681): [<ffff8000800e332c>]
>> console_unlock+0x124/0x130
>> hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c
>> softirqs last  enabled at (1305678): [<ffff80008005be1c>]
>> handle_softirqs+0x4cc/0x4e4
>> softirqs last disabled at (1305665): [<ffff8000800105b0>]
>> __do_softirq+0x14/0x20
>> ---[ end trace 0000000000000000 ]---
>>
>> This "bus limit 0" seems to be a bit suspicious to me as well as the
>> fact that swiotlb is used for the MMC DMA. I will investigate this
>> further tomorrow. The board boots fine though.
> Looking at the code I guess that bus_dma_limit set to 0 means no bus
> limit. But dma_mask for your device indicates 32-bit device limit. This
> can't work with address above 4GB. For some reason DMA code tries to
> allocate from higher address. This is most likely the reason
> dma_capable() returns false.

Indeed this looks like a source of the problem:

[    3.123618] Synopsys Designware Multimedia Card Interface Driver
[    3.139653] dwmmc_rockchip fe2b0000.mmc: IDMAC supports 32-bit 
address mode.
[    3.147739] dwmmc_rockchip fe2b0000.mmc: Using internal DMA controller.
[    3.161659] dwmmc_rockchip fe2b0000.mmc: Version ID is 270a
[    3.168455] dwmmc_rockchip fe2b0000.mmc: DW MMC controller at irq 
56,32 bit host data width,256 deep fifo
[    3.182651] dwmmc_rockchip fe2b0000.mmc: Got CD GPIO

...

[   11.009258] ------------[ cut here ]------------
[   11.014762] dwmmc_rockchip fe2b0000.mmc: swiotlb addr 
0x00000001faf00000+4096 overflow (mask ffffffff, bus limit 0).


> ...

Best regards
Baruch Siach Aug. 27, 2024, 7:03 a.m. UTC | #8
Hi Marek,

On Tue, Aug 27 2024, Marek Szyprowski wrote:
> On 27.08.2024 06:52, Baruch Siach wrote:
>> Hi Marek,
>>
>> Thanks for your report.
>>
>> On Mon, Aug 26 2024, Marek Szyprowski wrote:
>>> On 11.08.2024 09:09, Baruch Siach wrote:
>>>> From: Catalin Marinas <catalin.marinas@arm.com>
>>>>
>>>> Hardware DMA limit might not be power of 2. When RAM range starts above
>>>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
>>>> can not encode this limit.
>>>>
>>>> Use plain address for DMA zone limit.
>>>>
>>>> Since DMA zone can now potentially span beyond 4GB physical limit of
>>>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
>>>>
>>>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>>>> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
>>>> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
>>>> ---
>>> This patch landed recently in linux-next as commit ba0fb44aed47
>>> ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my
>>> tests I found that it introduces the following warning on ARM64/Rockchip
>>> based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts):
>> Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47?
>
> Yes, linux-next with above mentioned commits reverted works fine.
>
>
>> Upstream rockchip DTs have no dma-ranges property. Is that the case for
>> your platform as well?
>>
>> Can you share kernel report of DMA zones and swiotlb? On my platform I get:
>>
>> [    0.000000] Zone ranges:
>> [    0.000000]   DMA      [mem 0x0000000800000000-0x000000083fffffff]
>> [    0.000000]   DMA32    empty
>> [    0.000000]   Normal   [mem 0x0000000840000000-0x0000000fffffffff]
>> ...
>> [    0.000000] software IO TLB: area num 8.
>> [    0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB)
>>
>> What do you get at your end?
>
> On ba0fb44aed47 I got:
>
> [    0.000000] NUMA: No NUMA configuration found
> [    0.000000] NUMA: Faking a node at [mem 
> 0x0000000000200000-0x00000001ffffffff]
> [    0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff]
> [    0.000000] Zone ranges:
> [    0.000000]   DMA      [mem 0x0000000000200000-0x00000001ffffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   empty
> [    0.000000] Movable zone start for each node
> [    0.000000] Early memory node ranges
> [    0.000000]   node   0: [mem 0x0000000000200000-0x00000000083fffff]
> [    0.000000]   node   0: [mem 0x0000000009400000-0x00000000efffffff]
> [    0.000000]   node   0: [mem 0x00000001f0000000-0x00000001ffffffff]
> [    0.000000] Initmem setup node 0 [mem 
> 0x0000000000200000-0x00000001ffffffff]
> [    0.000000] On node 0, zone DMA: 512 pages in unavailable ranges
> [    0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges
> [    0.000000] cma: Reserved 96 MiB at 0x00000001f0000000 on node -1
>
> ...
>
> [    0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 3MB
> [    0.000000] software IO TLB: area num 4.
> [    0.000000] software IO TLB: mapped [mem 
> 0x00000001fac00000-0x00000001fb000000] (4MB)
>
> On the fa3c109a6d30 (parent commit of the $subject) I got:
>
> [    0.000000] NUMA: No NUMA configuration found
> [    0.000000] NUMA: Faking a node at [mem 
> 0x0000000000200000-0x00000001ffffffff]
> [    0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff]
> [    0.000000] Zone ranges:
> [    0.000000]   DMA      [mem 0x0000000000200000-0x00000000ffffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   [mem 0x0000000100000000-0x00000001ffffffff]
> [    0.000000] Movable zone start for each node
> [    0.000000] Early memory node ranges
> [    0.000000]   node   0: [mem 0x0000000000200000-0x00000000083fffff]
> [    0.000000]   node   0: [mem 0x0000000009400000-0x00000000efffffff]
> [    0.000000]   node   0: [mem 0x00000001f0000000-0x00000001ffffffff]
> [    0.000000] Initmem setup node 0 [mem 
> 0x0000000000200000-0x00000001ffffffff]
> [    0.000000] On node 0, zone DMA: 512 pages in unavailable ranges
> [    0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges
> [    0.000000] cma: Reserved 96 MiB at 0x00000000ea000000 on node -1
>
> ...
>
> [    0.000000] software IO TLB: area num 4.
> [    0.000000] software IO TLB: mapped [mem 
> 0x00000000e6000000-0x00000000ea000000] (64MB)
>
> It looks that for some reasons $subject patch changes the default zone 
> and swiotlb configuration.

Does this fix the issue?

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index bfb10969cbf0..7fcd0aaa9bb6 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -116,6 +116,9 @@ static void __init arch_reserve_crashkernel(void)
 
 static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
 {
+	if (memblock_start_of_DRAM() < U32_MAX)
+		zone_limit = min(zone_limit, U32_MAX);
+
 	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
 }
 

Thanks,
baruch
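
Plugged into the max_zone_phys() sketch earlier in this thread, the proposed
cap would work out as follows on the reported configuration (assumed values
taken from Marek's boot log):

/*
 * memblock_start_of_DRAM() = 0x200000, zone_dma_limit = PHYS_ADDR_MAX
 * (no firmware limit found):
 *
 *   0x200000 < U32_MAX   ->  zone_limit = min(PHYS_ADDR_MAX, U32_MAX)
 *                        ->  zone_limit = 0xffffffff
 *   min(0xffffffff, 0x1ffffffff) + 1 = 0x100000000
 *
 * so ZONE_DMA would again end at 4GB, matching the pre-patch layout, which
 * should keep the default swiotlb buffer reachable by 32-bit-only devices.
 */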

>>> ------------[ cut here ]------------
>>> dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096
>>> overflow (mask ffffffff, bus limit 0).
>>> WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308
>>> Modules linked in:
>>> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278
>>> Hardware name: Hardkernel ODROID-M1 (DT)
>>> pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>>> pc : swiotlb_map+0x2f0/0x308
>>> lr : swiotlb_map+0x2f0/0x308
>>> ...
>>> Call trace:
>>>    swiotlb_map+0x2f0/0x308
>>>    dma_direct_map_sg+0x9c/0x2e4
>>>    __dma_map_sg_attrs+0x28/0x94
>>>    dma_map_sg_attrs+0x10/0x24
>>>    dw_mci_pre_dma_transfer+0xb8/0xf4
>>>    dw_mci_pre_req+0x50/0x68
>>>    mmc_blk_mq_issue_rq+0x3e0/0x964
>>>    mmc_mq_queue_rq+0x118/0x2b4
>>>    blk_mq_dispatch_rq_list+0x21c/0x714
>>>    __blk_mq_sched_dispatch_requests+0x490/0x58c
>>>    blk_mq_sched_dispatch_requests+0x30/0x6c
>>>    blk_mq_run_hw_queue+0x284/0x40c
>>>    blk_mq_flush_plug_list.part.0+0x190/0x974
>>>    blk_mq_flush_plug_list+0x1c/0x2c
>>>    __blk_flush_plug+0xe4/0x140
>>>    blk_finish_plug+0x38/0x4c
>>>    __ext4_get_inode_loc+0x22c/0x654
>>>    __ext4_get_inode_loc_noinmem+0x40/0xa8
>>>    __ext4_iget+0x154/0xcc0
>>>    ext4_get_journal_inode+0x30/0x110
>>>    ext4_load_and_init_journal+0x9c/0xaf0
>>>    ext4_fill_super+0x1fec/0x2d90
>>>    get_tree_bdev+0x140/0x1d8
>>>    ext4_get_tree+0x18/0x24
>>>    vfs_get_tree+0x28/0xe8
>>>    path_mount+0x3e8/0xb7c
>>>    init_mount+0x68/0xac
>>>    do_mount_root+0x108/0x1dc
>>>    mount_root_generic+0x100/0x330
>>>    mount_root+0x160/0x2d0
>>>    initrd_load+0x1f0/0x2a0
>>>    prepare_namespace+0x4c/0x29c
>>>    kernel_init_freeable+0x4b4/0x50c
>>>    kernel_init+0x20/0x1d8
>>>    ret_from_fork+0x10/0x20
>>> irq event stamp: 1305682
>>> hardirqs last  enabled at (1305681): [<ffff8000800e332c>]
>>> console_unlock+0x124/0x130
>>> hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c
>>> softirqs last  enabled at (1305678): [<ffff80008005be1c>]
>>> handle_softirqs+0x4cc/0x4e4
>>> softirqs last disabled at (1305665): [<ffff8000800105b0>]
>>> __do_softirq+0x14/0x20
>>> ---[ end trace 0000000000000000 ]---
>>>
>>> This "bus limit 0" seems to be a bit suspicious to me as well as the
>>> fact that swiotlb is used for the MMC DMA. I will investigate this
>>> further tomorrow. The board boots fine though.
>> Looking at the code I guess that bus_dma_limit set to 0 means no bus
>> limit. But dma_mask for your device indicates 32-bit device limit. This
>> can't work with address above 4GB. For some reason DMA code tries to
>> allocate from higher address. This is most likely the reason
>> dma_capable() returns false.
>
> Indeed this looks like a source of the problem:
>
> [    3.123618] Synopsys Designware Multimedia Card Interface Driver
> [    3.139653] dwmmc_rockchip fe2b0000.mmc: IDMAC supports 32-bit 
> address mode.
> [    3.147739] dwmmc_rockchip fe2b0000.mmc: Using internal DMA controller.
> [    3.161659] dwmmc_rockchip fe2b0000.mmc: Version ID is 270a
> [    3.168455] dwmmc_rockchip fe2b0000.mmc: DW MMC controller at irq 
> 56,32 bit host data width,256 deep fifo
> [    3.182651] dwmmc_rockchip fe2b0000.mmc: Got CD GPIO
>
> ...
>
> [   11.009258] ------------[ cut here ]------------
> [   11.014762] dwmmc_rockchip fe2b0000.mmc: swiotlb addr 
> 0x00000001faf00000+4096 overflow (mask ffffffff, bus limit 0).
>
>
>> ...
>
> Best regards
Marek Szyprowski Aug. 27, 2024, 7:46 a.m. UTC | #9
On 27.08.2024 09:03, Baruch Siach wrote:
> On Tue, Aug 27 2024, Marek Szyprowski wrote:
>> On 27.08.2024 06:52, Baruch Siach wrote:
>>> Hi Marek,
>>>
>>> Thanks for your report.
>>>
>>> On Mon, Aug 26 2024, Marek Szyprowski wrote:
>>>> On 11.08.2024 09:09, Baruch Siach wrote:
>>>>> From: Catalin Marinas <catalin.marinas@arm.com>
>>>>>
>>>>> Hardware DMA limit might not be power of 2. When RAM range starts above
>>>>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
>>>>> can not encode this limit.
>>>>>
>>>>> Use plain address for DMA zone limit.
>>>>>
>>>>> Since DMA zone can now potentially span beyond 4GB physical limit of
>>>>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
>>>>>
>>>>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>>>>> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
>>>>> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
>>>>> ---
>>>> This patch landed recently in linux-next as commit ba0fb44aed47
>>>> ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my
>>>> tests I found that it introduces the following warning on ARM64/Rockchip
>>>> based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts):
>>> Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47?
>> Yes, linux-next with the above-mentioned commits reverted works fine.
>>
>>
>>> Upstream rockchip DTs have no dma-ranges property. Is that the case for
>>> your platform as well?
>>>
>>> Can you share the kernel's report of DMA zones and swiotlb? On my platform I get:
>>>
>>> [    0.000000] Zone ranges:
>>> [    0.000000]   DMA      [mem 0x0000000800000000-0x000000083fffffff]
>>> [    0.000000]   DMA32    empty
>>> [    0.000000]   Normal   [mem 0x0000000840000000-0x0000000fffffffff]
>>> ...
>>> [    0.000000] software IO TLB: area num 8.
>>> [    0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB)
>>>
>>> What do you get at your end?
>> On ba0fb44aed47 I got:
>>
>> [    0.000000] NUMA: No NUMA configuration found
>> [    0.000000] NUMA: Faking a node at [mem
>> 0x0000000000200000-0x00000001ffffffff]
>> [    0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff]
>> [    0.000000] Zone ranges:
>> [    0.000000]   DMA      [mem 0x0000000000200000-0x00000001ffffffff]
>> [    0.000000]   DMA32    empty
>> [    0.000000]   Normal   empty
>> [    0.000000] Movable zone start for each node
>> [    0.000000] Early memory node ranges
>> [    0.000000]   node   0: [mem 0x0000000000200000-0x00000000083fffff]
>> [    0.000000]   node   0: [mem 0x0000000009400000-0x00000000efffffff]
>> [    0.000000]   node   0: [mem 0x00000001f0000000-0x00000001ffffffff]
>> [    0.000000] Initmem setup node 0 [mem
>> 0x0000000000200000-0x00000001ffffffff]
>> [    0.000000] On node 0, zone DMA: 512 pages in unavailable ranges
>> [    0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges
>> [    0.000000] cma: Reserved 96 MiB at 0x00000001f0000000 on node -1
>>
>> ...
>>
>> [    0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 3MB
>> [    0.000000] software IO TLB: area num 4.
>> [    0.000000] software IO TLB: mapped [mem
>> 0x00000001fac00000-0x00000001fb000000] (4MB)
>>
>> On the fa3c109a6d30 (parent commit of the $subject) I got:
>>
>> [    0.000000] NUMA: No NUMA configuration found
>> [    0.000000] NUMA: Faking a node at [mem
>> 0x0000000000200000-0x00000001ffffffff]
>> [    0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff]
>> [    0.000000] Zone ranges:
>> [    0.000000]   DMA      [mem 0x0000000000200000-0x00000000ffffffff]
>> [    0.000000]   DMA32    empty
>> [    0.000000]   Normal   [mem 0x0000000100000000-0x00000001ffffffff]
>> [    0.000000] Movable zone start for each node
>> [    0.000000] Early memory node ranges
>> [    0.000000]   node   0: [mem 0x0000000000200000-0x00000000083fffff]
>> [    0.000000]   node   0: [mem 0x0000000009400000-0x00000000efffffff]
>> [    0.000000]   node   0: [mem 0x00000001f0000000-0x00000001ffffffff]
>> [    0.000000] Initmem setup node 0 [mem
>> 0x0000000000200000-0x00000001ffffffff]
>> [    0.000000] On node 0, zone DMA: 512 pages in unavailable ranges
>> [    0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges
>> [    0.000000] cma: Reserved 96 MiB at 0x00000000ea000000 on node -1
>>
>> ...
>>
>> [    0.000000] software IO TLB: area num 4.
>> [    0.000000] software IO TLB: mapped [mem
>> 0x00000000e6000000-0x00000000ea000000] (64MB)
>>
>> It looks like for some reason the $subject patch changes the default zone
>> and swiotlb configuration.
> Does this fix the issue?
>
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index bfb10969cbf0..7fcd0aaa9bb6 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -116,6 +116,9 @@ static void __init arch_reserve_crashkernel(void)
>   
>   static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
>   {
> +	if (memblock_start_of_DRAM() < U32_MAX)
> +		zone_limit = min(zone_limit, U32_MAX);
> +
>   	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
>   }
>   
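
For reference, here is roughly how that clamp plays out with the memory
map from the boot log above. This is a stand-alone sketch with hard-coded
values taken from the log, not the kernel code:

#include <stdint.h>
#include <stdio.h>

#define U32_MAX 0xffffffffULL

/* Stand-alone model of max_zone_phys() with the proposed clamp applied. */
static uint64_t max_zone_phys(uint64_t zone_limit, uint64_t dram_start,
			      uint64_t dram_end)
{
	if (dram_start < U32_MAX && zone_limit > U32_MAX)
		zone_limit = U32_MAX;

	return (zone_limit < dram_end - 1 ? zone_limit : dram_end - 1) + 1;
}

int main(void)
{
	/* Odroid-M1: RAM spans 0x200000..0x1ffffffff and there is no
	 * dma-ranges, so the DT-derived limit is the end of RAM. */
	uint64_t limit = max_zone_phys(0x1ffffffffULL, 0x200000ULL,
				       0x200000000ULL);

	printf("ZONE_DMA ends at 0x%llx\n", (unsigned long long)limit);
	/* Prints 0x100000000: ZONE_DMA is capped at 4GB again, matching
	 * the pre-regression zone layout. */
	return 0;
}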

Yes, this fixes my issue. Thanks!

Feel free to add:

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>

Best regards
Neil Armstrong Aug. 29, 2024, 1:42 p.m. UTC | #10
Hi,

On 11/08/2024 09:09, Baruch Siach wrote:
> From: Catalin Marinas <catalin.marinas@arm.com>
> 
> Hardware DMA limit might not be power of 2. When RAM range starts above
> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
> can not encode this limit.
> 
> Use plain address for DMA zone limit.
> 
> Since DMA zone can now potentially span beyond 4GB physical limit of
> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
> 
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
> ---
>   arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>   arch/powerpc/mm/mem.c      |  5 ++++-
>   arch/s390/mm/init.c        |  2 +-
>   include/linux/dma-direct.h |  2 +-
>   kernel/dma/direct.c        |  6 +++---
>   kernel/dma/pool.c          |  4 ++--
>   kernel/dma/swiotlb.c       |  6 +++---
>   7 files changed, 29 insertions(+), 26 deletions(-)
> 

<snip>

This change breaks the Qualcomm SM8550-HDK boot since next-20240826.
It doesn't affect SM8550-QRD or other similar SoCs like SM8650 or SM8450.
The last CI run on next-20240828 can be found at:
https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/pipelines/100936

SM8550-HDK boot log:
https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/jobs/165617

bisect log:
# bad: [b18bbfc14a38b5234e09c2adcf713e38063a7e6e] Add linux-next specific files for 20240829
# good: [5be63fc19fcaa4c236b307420483578a56986a37] Linux 6.11-rc5
git bisect start 'FETCH_HEAD' 'v6.11-rc5'
# bad: [dc09f0263a0accf41821d260f4bf7ad9a4f7b7d8] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
git bisect bad dc09f0263a0accf41821d260f4bf7ad9a4f7b7d8
# bad: [97c7e618752776e03f50311400bb73c01489fb17] Merge branch 'for-next' of git://github.com/Xilinx/linux-xlnx.git
git bisect bad 97c7e618752776e03f50311400bb73c01489fb17
# good: [27ad8eb339a5e3f96aed5f3a3b5901994ce7856d] Merge branch 'clang-format' of https://github.com/ojeda/linux.git
git bisect good 27ad8eb339a5e3f96aed5f3a3b5901994ce7856d
# bad: [fd34d49f6d8dbd157b17b675dc51a145cdad580c] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/khilman/linux-omap.git
git bisect bad fd34d49f6d8dbd157b17b675dc51a145cdad580c
# bad: [29f35e8cf8bff7c69a740edb1cf3d62d211f5a43] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git
git bisect bad 29f35e8cf8bff7c69a740edb1cf3d62d211f5a43
# good: [41860d49473c0c09dc0a2a4d148047f97aaa2539] perf sched: Use perf_tool__init()
git bisect good 41860d49473c0c09dc0a2a4d148047f97aaa2539
# good: [6236ebe07131a7746d870f1d8eb3637a8df13e70] perf daemon: Fix the build on more 32-bit architectures
git bisect good 6236ebe07131a7746d870f1d8eb3637a8df13e70
# good: [92b0d033c80c882e6be26dfeeb2f24c53bdeeee6] Merge branches 'for-next/acpi', 'for-next/misc', 'for-next/perf', 'for-next/selftests' and 'for-next/timers' into for-next/core
git bisect good 92b0d033c80c882e6be26dfeeb2f24c53bdeeee6
# bad: [5db893307df81f0c1dd6b6f167dc263e88ba855a] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rmk/linux.git
git bisect bad 5db893307df81f0c1dd6b6f167dc263e88ba855a
# bad: [f65eaf209abc5f4761623aaa1e110bdb6de124ed] Merge branch 'for-next' of git://git.infradead.org/users/hch/dma-mapping.git
git bisect bad f65eaf209abc5f4761623aaa1e110bdb6de124ed
# bad: [f69e342eec008e1bab772d3963c3dd9979293e13] dma-mapping: call ->unmap_page and ->unmap_sg unconditionally
git bisect bad f69e342eec008e1bab772d3963c3dd9979293e13
# bad: [ba0fb44aed47693cc2482427f63ba6cd19051327] dma-mapping: replace zone_dma_bits by zone_dma_limit
git bisect bad ba0fb44aed47693cc2482427f63ba6cd19051327
# good: [fa3c109a6d302b56437a6412c5f3044c3e12de03] dma-mapping: use bit masking to check VM_DMA_COHERENT
git bisect good fa3c109a6d302b56437a6412c5f3044c3e12de03
# first bad commit: [ba0fb44aed47693cc2482427f63ba6cd19051327] dma-mapping: replace zone_dma_bits by zone_dma_limit

Boot log with earlycon:
[    0.000000] Booting Linux on physical CPU 0x0000000000 [0x411fd461]
[    0.000000] Linux version 6.11.0-rc5-next-20240829-07694-gb36219e63903 (narmstrong@arrakeen.starnux.net) (aarch64-none-linux-gnu-gcc (Arm GNU Toolchain 13.2.rel1 (Build arm-13.7)) 13.2.1 20231009, GNU ld (Arm GNU Toolchain 13.2.rel1 (Build arm-13.7)) 2.41.0.20231009) #397 SMP PREEMPT Thu Aug 29 15:25:39 CEST 2024
[    0.000000] KASLR enabled
[    0.000000] random: crng init done
[    0.000000] Machine model: Qualcomm Technologies, Inc. SM8550 HDK
[    0.000000] printk: debug: ignoring loglevel setting.
[    0.000000] efi: UEFI not found.
[    0.000000] [Firmware Bug]: Kernel image misaligned at boot, please fix your bootloader!
[    0.000000] OF: reserved mem: 0x0000000080000000..0x00000000809fffff (10240 KiB) nomap non-reusable hyp-region@80000000
[    0.000000] OF: reserved mem: 0x0000000080a00000..0x0000000080dfffff (4096 KiB) nomap non-reusable cpusys-vm-region@80a00000
[    0.000000] OF: reserved mem: 0x0000000080e00000..0x00000000811cffff (3904 KiB) nomap non-reusable hyp-tags-region@80e00000
[    0.000000] OF: reserved mem: 0x00000000811d0000..0x00000000811fffff (192 KiB) nomap non-reusable hyp-tags-reserved-region@811d0000
[    0.000000] OF: reserved mem: 0x0000000081a00000..0x0000000081c5ffff (2432 KiB) nomap non-reusable xbl-dt-log-merged-region@81a00000
[    0.000000] OF: reserved mem: 0x0000000081c60000..0x0000000081c7ffff (128 KiB) nomap non-reusable aop-cmd-db-region@81c60000
[    0.000000] OF: reserved mem: 0x0000000081c80000..0x0000000081cf3fff (464 KiB) nomap non-reusable aop-config-merged-region@81c80000
[    0.000000] OF: reserved mem: 0x0000000081d00000..0x0000000081efffff (2048 KiB) nomap non-reusable smem@81d00000
[    0.000000] OF: reserved mem: 0x0000000081f00000..0x0000000081f1ffff (128 KiB) nomap non-reusable adsp-mhi-region@81f00000
[    0.000000] OF: reserved mem: 0x0000000082600000..0x00000000826fffff (1024 KiB) nomap non-reusable global-sync-region@82600000
[    0.000000] OF: reserved mem: 0x0000000082700000..0x00000000827fffff (1024 KiB) nomap non-reusable tz-stat-region@82700000
[    0.000000] OF: reserved mem: 0x0000000082800000..0x0000000086dfffff (71680 KiB) nomap non-reusable cdsp-secure-heap-region@82800000
[    0.000000] OF: reserved mem: 0x000000008a800000..0x000000009affffff (270336 KiB) nomap non-reusable mpss-region@8a800000
[    0.000000] OF: reserved mem: 0x000000009b000000..0x000000009b07ffff (512 KiB) nomap non-reusable q6-mpss-dtb-region@9b000000
[    0.000000] OF: reserved mem: 0x000000009b080000..0x000000009b08ffff (64 KiB) nomap non-reusable ipa-fw-region@9b080000
[    0.000000] OF: reserved mem: 0x000000009b090000..0x000000009b099fff (40 KiB) nomap non-reusable ipa-gsi-region@9b090000
[    0.000000] OF: reserved mem: 0x000000009b09a000..0x000000009b09bfff (8 KiB) nomap non-reusable gpu-micro-code-region@9b09a000
[    0.000000] OF: reserved mem: 0x000000009b100000..0x000000009b27ffff (1536 KiB) nomap non-reusable spss-region@9b100000
[    0.000000] OF: reserved mem: 0x000000009b280000..0x000000009b2dffff (384 KiB) nomap non-reusable spu-tz-shared-region@9b280000
[    0.000000] OF: reserved mem: 0x000000009b2e0000..0x000000009b2fffff (128 KiB) nomap non-reusable spu-modem-shared-region@9b2e0000
[    0.000000] OF: reserved mem: 0x000000009b300000..0x000000009bafffff (8192 KiB) nomap non-reusable camera-region@9b300000
[    0.000000] OF: reserved mem: 0x000000009bb00000..0x000000009c1fffff (7168 KiB) nomap non-reusable video-region@9bb00000
[    0.000000] OF: reserved mem: 0x000000009c200000..0x000000009c8fffff (7168 KiB) nomap non-reusable cvp-region@9c200000
[    0.000000] OF: reserved mem: 0x000000009c900000..0x000000009e8fffff (32768 KiB) nomap non-reusable cdsp-region@9c900000
[    0.000000] OF: reserved mem: 0x000000009e900000..0x000000009e97ffff (512 KiB) nomap non-reusable q6-cdsp-dtb-region@9e900000
[    0.000000] OF: reserved mem: 0x000000009e980000..0x000000009e9fffff (512 KiB) nomap non-reusable q6-adsp-dtb-region@9e980000
[    0.000000] OF: reserved mem: 0x000000009ea00000..0x00000000a2a7ffff (66048 KiB) nomap non-reusable adspslpi-region@9ea00000
[    0.000000] OF: reserved mem: 0x00000000d4a80000..0x00000000d4cfffff (2560 KiB) nomap non-reusable rmtfs-region@d4a80000
[    0.000000] OF: reserved mem: 0x00000000d4d00000..0x00000000d7ffffff (52224 KiB) nomap non-reusable mpss-dsm-region@d4d00000
[    0.000000] OF: reserved mem: 0x00000000d8000000..0x00000000d80fffff (1024 KiB) nomap non-reusable tz-reserved-region@d8000000
[    0.000000] OF: reserved mem: 0x00000000d8100000..0x00000000d813ffff (256 KiB) nomap non-reusable xbl-sc-region@d8100000
[    0.000000] OF: reserved mem: 0x00000000d8140000..0x00000000d82fffff (1792 KiB) nomap non-reusable cpucp-fw-region@d8140000
[    0.000000] OF: reserved mem: 0x00000000d8300000..0x00000000d87fffff (5120 KiB) nomap non-reusable qtee-region@d8300000
[    0.000000] OF: reserved mem: 0x00000000d8800000..0x00000000e11fffff (141312 KiB) nomap non-reusable ta-region@d8800000
[    0.000000] OF: reserved mem: 0x00000000e1200000..0x00000000e393ffff (40192 KiB) nomap non-reusable tz-tags-region@e1200000
[    0.000000] OF: reserved mem: 0x00000000e6440000..0x00000000e66b8fff (2532 KiB) nomap non-reusable hwfence-shbuf-region@e6440000
[    0.000000] OF: reserved mem: 0x00000000f3600000..0x00000000f80edfff (76728 KiB) nomap non-reusable trust-ui-vm-region@f3600000
[    0.000000] OF: reserved mem: 0x00000000f80ee000..0x00000000f80eefff (4 KiB) nomap non-reusable trust-ui-vm-dump-region@f80ee000
[    0.000000] OF: reserved mem: 0x00000000f80ef000..0x00000000f80f7fff (36 KiB) nomap non-reusable trust-ui-vm-qrt-region@f80ef000
[    0.000000] OF: reserved mem: 0x00000000f80f8000..0x00000000f80fbfff (16 KiB) nomap non-reusable trust-ui-vm-vblk0-ring-region@f80f8000
[    0.000000] OF: reserved mem: 0x00000000f80fc000..0x00000000f80fffff (16 KiB) nomap non-reusable trust-ui-vm-vblk1-ring-region@f80fc000
[    0.000000] OF: reserved mem: 0x00000000f8100000..0x00000000f81fffff (1024 KiB) nomap non-reusable trust-ui-vm-swiotlb-region@f8100000
[    0.000000] OF: reserved mem: 0x00000000f8400000..0x00000000fcbfffff (73728 KiB) nomap non-reusable oem-vm-region@f8400000
[    0.000000] OF: reserved mem: 0x00000000fcc00000..0x00000000fcc03fff (16 KiB) nomap non-reusable oem-vm-vblk0-ring-region@fcc00000
[    0.000000] OF: reserved mem: 0x00000000fcc04000..0x00000000fcd03fff (1024 KiB) nomap non-reusable oem-vm-swiotlb-region@fcc04000
[    0.000000] OF: reserved mem: 0x00000000fce00000..0x00000000ff6fffff (41984 KiB) nomap non-reusable hyp-ext-tags-region@fce00000
[    0.000000] OF: reserved mem: 0x00000000ff700000..0x00000000ff7fffff (1024 KiB) nomap non-reusable hyp-ext-reserved-region@ff700000
[    0.000000] earlycon: qcom_geni0 at MMIO 0x0000000000a9c000 (options '115200n8')
[    0.000000] printk: legacy bootconsole [qcom_geni0] enabled
[    0.000000] NUMA: Faking a node at [mem 0x0000000080000000-0x0000000affffffff]
[    0.000000] NODE_DATA(0) allocated [mem 0xafe933200-0xafe93583f]
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000080000000-0x0000000affffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   empty
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000080000000-0x0000000080dfffff]
[    0.000000]   node   0: [mem 0x00000000811d0000-0x00000000811fffff]
[    0.000000]   node   0: [mem 0x0000000081200000-0x00000000819fffff]
[    0.000000]   node   0: [mem 0x0000000081a00000-0x0000000081cf3fff]
[    0.000000]   node   0: [mem 0x0000000081cf4000-0x0000000081cfffff]
[    0.000000]   node   0: [mem 0x0000000081d00000-0x0000000081f1ffff]
[    0.000000]   node   0: [mem 0x0000000081f20000-0x00000000825fffff]
[    0.000000]   node   0: [mem 0x0000000082600000-0x0000000086dfffff]
[    0.000000]   node   0: [mem 0x0000000086e00000-0x000000008a7fffff]
[    0.000000]   node   0: [mem 0x000000008a800000-0x000000009b09bfff]
[    0.000000]   node   0: [mem 0x000000009b09c000-0x000000009b0fffff]
[    0.000000]   node   0: [mem 0x000000009b100000-0x00000000a2a7ffff]
[    0.000000]   node   0: [mem 0x00000000a2a80000-0x00000000d4a7ffff]
[    0.000000]   node   0: [mem 0x00000000d4a80000-0x00000000d7ffffff]
[    0.000000]   node   0: [mem 0x00000000d8140000-0x00000000d815ffff]
[    0.000000]   node   0: [mem 0x00000000e1bb0000-0x00000000e393ffff]
[    0.000000]   node   0: [mem 0x00000000e3940000-0x00000000e643ffff]
[    0.000000]   node   0: [mem 0x00000000e6440000-0x00000000e66b8fff]
[    0.000000]   node   0: [mem 0x00000000e66b9000-0x00000000f35fffff]
[    0.000000]   node   0: [mem 0x00000000f3600000-0x00000000f81fffff]
[    0.000000]   node   0: [mem 0x00000000f8200000-0x00000000f83fffff]
[    0.000000]   node   0: [mem 0x00000000f8400000-0x00000000fcd03fff]
[    0.000000]   node   0: [mem 0x00000000fcd04000-0x00000000fcdfffff]
[    0.000000]   node   0: [mem 0x00000000fce00000-0x00000000ff7fffff]
[    0.000000]   node   0: [mem 0x00000000ff800000-0x00000000ffffffff]
[    0.000000]   node   0: [mem 0x0000000880000000-0x00000008b99fffff]
[    0.000000]   node   0: [mem 0x00000008c0000000-0x0000000affffffff]
[    0.000000] Initmem setup node 0 [mem 0x0000000080000000-0x0000000affffffff]
[    0.000000] On node 0, zone DMA: 976 pages in unavailable ranges
[    0.000000] On node 0, zone DMA: 320 pages in unavailable ranges
[    0.000000] On node 0, zone DMA: 39504 pages in unavailable ranges
[    0.000000] On node 0, zone DMA: 26112 pages in unavailable ranges
[    0.000000] cma: Reserved 32 MiB at 0x0000000880000000 on node -1
[    0.000000] psci: probing for conduit method from DT.
[    0.000000] psci: PSCIv1.1 detected in firmware.
[    0.000000] psci: Using standard PSCI v0.2 function IDs
[    0.000000] psci: MIGRATE_INFO_TYPE not supported.
[    0.000000] psci: SMC Calling Convention v1.3
[    0.000000] psci: OSI mode supported.
[    0.000000] percpu: Embedded 25 pages/cpu s61656 r8192 d32552 u102400
[    0.000000] pcpu-alloc: s61656 r8192 d32552 u102400 alloc=25*4096
[    0.000000] pcpu-alloc: [0] 0 [0] 1 [0] 2 [0] 3 [0] 4 [0] 5 [0] 6 [0] 7
[    0.000000] Detected PIPT I-cache on CPU0
[    0.000000] CPU features: detected: Address authentication (architected QARMA5 algorithm)
[    0.000000] CPU features: detected: GIC system register CPU interface
[    0.000000] CPU features: detected: Spectre-v4
[    0.000000] CPU features: detected: ARM erratum 2457168
[    0.000000] CPU features: detected: ARM erratum 2658417
[    0.000000] CPU features: detected: ARM errata 2966298, 3117295
[    0.000000] alternatives: applying boot alternatives
[    0.000000] Kernel command line: earlycon root= allow_mismatched_32bit_el0 clk_ignore_unused pd_ignore_unused systemd.mask=rmtfs.service ignore_loglevel -- androidboot.verifiedbootstate=orange androidboot.keymaster=1  androidboot.bootdevice=1d84000.ufshc androidboot.fstab_suffix=default androidboot.boot_devices=soc/1d84000.ufshc androidboot.serialno=2a7a254f androidboot.baseband=apq androidboot.force_normal_boot=1
[    0.000000] Dentry cache hash table entries: 2097152 (order: 12, 16777216 bytes, linear)
[    0.000000] Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes, linear)
[    0.000000] Fallback order for Node 0: 0
[    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 3078816
[    0.000000] Policy zone: DMA
[    0.000000] mem auto-init: stack:all(zero), heap alloc:off, heap free:off
[    0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 11MB
[    0.000000] software IO TLB: area num 8.
[    0.000000] software IO TLB: SWIOTLB bounce buffer size roundup to 16MB
[    0.000000] software IO TLB: mapped [mem 0x0000000aeff38000-0x0000000af0f38000] (16MB)
[    0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=8, Nodes=1
[    0.000000] rcu: Preemptible hierarchical RCU implementation.
[    0.000000] rcu: 	RCU event tracing is enabled.
[    0.000000] rcu: 	RCU restricting CPUs from NR_CPUS=512 to nr_cpu_ids=8.
[    0.000000] 	Trampoline variant of Tasks RCU enabled.
[    0.000000] 	Tracing variant of Tasks RCU enabled.
[    0.000000] rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
[    0.000000] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=8
[    0.000000] RCU Tasks: Setting shift to 3 and lim to 1 rcu_task_cb_adjust=1 rcu_task_cpu_ids=8.
[    0.000000] RCU Tasks Trace: Setting shift to 3 and lim to 1 rcu_task_cb_adjust=1 rcu_task_cpu_ids=8.
[    0.000000] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0
[    0.000000] GICv3: 988 SPIs implemented
[    0.000000] GICv3: 0 Extended SPIs implemented
[    0.000000] Root IRQ handler: gic_handle_irq
[    0.000000] GICv3: GICv3 features: 16 PPIs, DirectLPI
[    0.000000] GICv3: GICD_CTRL.DS=1, SCR_EL3.FIQ=0
[    0.000000] GICv3: Enabling SGIs without active state
[    0.000000] GICv3: CPU0: found redistributor 0 region 0:0x0000000017180000
[    0.000000] ITS [mem 0x17140000-0x1715ffff]
[    0.000000] ITS@0x0000000017140000: Devices Table too large, reduce ids 32->19
[    0.000000] ITS@0x0000000017140000: Devices too large, reduce ITS pages 1024->256
[    0.000000] ITS@0x0000000017140000: allocated 131072 Devices @81500000 (indirect, esz 8, psz 4K, shr 1)
[    0.000000] ITS@0x0000000017140000: allocated 4096 Interrupt Collections @8145b000 (flat, esz 1, psz 4K, shr 1)
[    0.000000] GICv3: using LPI property table @0x0000000081470000
[    0.000000] GICv3: CPU0: using allocated LPI pending table @0x0000000081480000
[    0.000000] rcu: srcu_init: Setting srcu_struct sizes based on contention.
[    0.000000] arch_timer: cp15 and mmio timer(s) running at 19.20MHz (virt/virt).
[    0.000000] clocksource: arch_sys_counter: mask: 0xffffffffffffff max_cycles: 0x46d987e47, max_idle_ns: 440795202767 ns
[    0.000000] sched_clock: 56 bits at 19MHz, resolution 52ns, wraps every 4398046511078ns
[    0.008355] arm-pv: using stolen time PV
[    0.012595] Console: colour dummy device 80x25
[    0.017182] printk: legacy console [tty0] enabled
[    0.022031] printk: legacy bootconsole [qcom_geni0] disabled
<----------------->

The system handler takes over; I'll analyze the system firmware logs to understand which access caused the crash.

I tested the "arm64: mm: fix DMA zone when dma-ranges is missing" patch at [1], but it doesn't help.

[1] https://lore.kernel.org/all/d8e92b14181fc815773ac4f4bba70b5d48bc390e.1724838684.git.baruch@tkos.co.il/

Neil

#regzbot introduced: ba0fb44aed47693cc2482427f63ba6cd19051327
Robin Murphy Aug. 29, 2024, 2:38 p.m. UTC | #11
On 2024-08-29 2:42 pm, Neil Armstrong wrote:
> Hi,
> 
> On 11/08/2024 09:09, Baruch Siach wrote:
>> From: Catalin Marinas <catalin.marinas@arm.com>
>>
>> Hardware DMA limit might not be power of 2. When RAM range starts above
>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
>> can not encode this limit.
>>
>> Use plain address for DMA zone limit.
>>
>> Since DMA zone can now potentially span beyond 4GB physical limit of
>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
>>
>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
>> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
>> ---
>>   arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>>   arch/powerpc/mm/mem.c      |  5 ++++-
>>   arch/s390/mm/init.c        |  2 +-
>>   include/linux/dma-direct.h |  2 +-
>>   kernel/dma/direct.c        |  6 +++---
>>   kernel/dma/pool.c          |  4 ++--
>>   kernel/dma/swiotlb.c       |  6 +++---
>>   7 files changed, 29 insertions(+), 26 deletions(-)
>>
> 
> <snip>
> 
> This change breaks the Qualcomm SM8550-HDK boot since next-20240826.
> It doesn't affect SM8550-QRD or other similar SoCs like SM8650 or SM8450.
> The last CI run on next-20240828 can be found at:
> https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/pipelines/100936
> 
> SM8550-HDK boot log:
> https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/jobs/165617
> 
[...]

Yeah, a 35-bit ZONE_DMA is sure to make stuff go wrong:

> [    0.000000] Zone ranges:
> [    0.000000]   DMA      [mem 0x0000000080000000-0x0000000affffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   empty

Compared to before:

[    0.000000]   DMA      [mem 0x0000000080000000-0x00000000ffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   [mem 0x0000000100000000-0x0000000affffffff]

This'll be because the SoC DT is describing a general non-restrictive range:
		dma-ranges = <0 0 0 0 0x10 0>;

Which proves we need more information than 
{acpi,of}_dma_get_max_cpu_address() are currently able to give us, 
because what zone_dma_limit actually wants to be is the *minimum* of 
two values: the lowest of the highest CPU addresses across the DMA 
ranges, and the lowest CPU address of any DMA range plus 2^32. I was 
thinking it had all ended up looking a bit too easy... :)
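
To make that concrete with this board's numbers, a quick back-of-the-envelope 
sketch of the calculation described above (treating the limit as an inclusive 
address is my assumption):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* dma-ranges = <0 0 0 0 0x10 0>: a single DMA range covering CPU
	 * addresses 0x0 .. 0xf_ffffffff (64GB). */
	uint64_t range_cpu_start = 0x0;
	uint64_t range_cpu_end   = 0xfffffffffULL;

	/* Minimum of: the highest CPU address of the range, and the lowest
	 * CPU address of the range plus 2^32 (minus one, taking the limit
	 * as an inclusive address). */
	uint64_t want = range_cpu_end;
	uint64_t cap  = range_cpu_start + (1ULL << 32) - 1;

	if (cap < want)
		want = cap;

	printf("zone_dma_limit should be 0x%llx\n", (unsigned long long)want);
	/* Prints 0xffffffff: ZONE_DMA should end at the 4GB boundary, not
	 * span the whole 35-bit range seen in the broken boot log. */
	return 0;
}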

I think v1 of the fix[1] might actually work out for this, albeit still 
for the wrong reasons - if so, I concede that maybe at this point it 
might be safest to go back to that one as a quick short-term fix (with a 
big fat comment to say so) rather than try to rush the proper solution 
or revert everything.

Thanks,
Robin.

[1] 
https://lore.kernel.org/linux-arm-kernel/731d204f5f556ad61bbaf004b1d984f83c90b4f5.1724748249.git.baruch@tkos.co.il/
Neil Armstrong Aug. 29, 2024, 2:54 p.m. UTC | #12
Hi Robin,

On 29/08/2024 16:38, Robin Murphy wrote:
> On 2024-08-29 2:42 pm, Neil Armstrong wrote:
>> Hi,
>>
>> On 11/08/2024 09:09, Baruch Siach wrote:
>>> From: Catalin Marinas <catalin.marinas@arm.com>
>>>
>>> Hardware DMA limit might not be power of 2. When RAM range starts above
>>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit
>>> can not encode this limit.
>>>
>>> Use plain address for DMA zone limit.
>>>
>>> Since DMA zone can now potentially span beyond 4GB physical limit of
>>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.
>>>
>>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>>> Co-developed-by: Baruch Siach <baruch@tkos.co.il>
>>> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
>>> ---
>>>   arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>>>   arch/powerpc/mm/mem.c      |  5 ++++-
>>>   arch/s390/mm/init.c        |  2 +-
>>>   include/linux/dma-direct.h |  2 +-
>>>   kernel/dma/direct.c        |  6 +++---
>>>   kernel/dma/pool.c          |  4 ++--
>>>   kernel/dma/swiotlb.c       |  6 +++---
>>>   7 files changed, 29 insertions(+), 26 deletions(-)
>>>
>>
>> <snip>
>>
>> This change breaks the Qualcomm SM8550-HDK boot since next-20240826.
>> It doesn't affect SM8550-QRD or other similar SoCs like SM8650 or SM8450.
>> The last CI run on next-20240828 can be found at:
>> https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/pipelines/100936
>>
>> SM8550-HDK boot log:
>> https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/jobs/165617
>>
> [...]
> 
> Yeah, a 35-bit ZONE_DMA is sure to make stuff go wrong:
> 
>> [    0.000000] Zone ranges:
>> [    0.000000]   DMA      [mem 0x0000000080000000-0x0000000affffffff]
>> [    0.000000]   DMA32    empty
>> [    0.000000]   Normal   empty
> 
> Compared to before:
> 
> [    0.000000]   DMA      [mem 0x0000000080000000-0x00000000ffffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   [mem 0x0000000100000000-0x0000000affffffff]
> 
> This'll be because the SoC DT is describing a general non-restrictive range:
>          dma-ranges = <0 0 0 0 0x10 0>;
> 
> Which proves we need more information than {acpi,of}_dma_get_max_cpu_address() are currently able to give us, because what zone_dma_limit actually wants to be is the *minimum* of the lowest highest CPU address of any DMA range, and the lowest CPU address of any DMA range + 2^32. I was thinking it had all ended up looking a bit too easy... :)
> 
> I think v1 of the fix[1] might actually work out for this, albeit still for the wrong reasons - if so, I concede that maybe at this point it might be safest to go back to that one as a quick short-term fix (with a big fat comment to say so) rather than try to rush the proper solution or revert everything.

Indeed, the v1 patch makes boot work again:
[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000080000000-0x00000000ffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   [mem 0x0000000100000000-0x0000000affffffff]

Please add my:
Tested-by: Neil Armstrong <neil.armstrong@linaro.org> # on SM8550-HDK

Thanks,
Neil

> 
> Thanks,
> Robin.
> 
> [1] https://lore.kernel.org/linux-arm-kernel/731d204f5f556ad61bbaf004b1d984f83c90b4f5.1724748249.git.baruch@tkos.co.il/
diff mbox series

Patch

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 9b5ab6818f7f..c45e2152ca9e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -115,35 +115,35 @@  static void __init arch_reserve_crashkernel(void)
 }
 
 /*
- * Return the maximum physical address for a zone accessible by the given bits
- * limit. If DRAM starts above 32-bit, expand the zone to the maximum
+ * Return the maximum physical address for a zone given its limit.
+ * If DRAM starts above 32-bit, expand the zone to the maximum
  * available memory, otherwise cap it at 32-bit.
  */
-static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
+static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
 {
-	phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
 	phys_addr_t phys_start = memblock_start_of_DRAM();
 
 	if (phys_start > U32_MAX)
-		zone_mask = PHYS_ADDR_MAX;
-	else if (phys_start > zone_mask)
-		zone_mask = U32_MAX;
+		zone_limit = PHYS_ADDR_MAX;
+	else if (phys_start > zone_limit)
+		zone_limit = U32_MAX;
 
-	return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
+	return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
 }
 
 static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
-	unsigned int __maybe_unused acpi_zone_dma_bits;
-	unsigned int __maybe_unused dt_zone_dma_bits;
-	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
+	phys_addr_t __maybe_unused acpi_zone_dma_limit;
+	phys_addr_t __maybe_unused dt_zone_dma_limit;
+	phys_addr_t __maybe_unused dma32_phys_limit =
+		max_zone_phys(DMA_BIT_MASK(32));
 
 #ifdef CONFIG_ZONE_DMA
-	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
-	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
-	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
-	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
+	acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
+	dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
+	zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
+	arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
 	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
 #endif
 #ifdef CONFIG_ZONE_DMA32
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d325217ab201..05b7f702b3f7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -216,7 +216,7 @@  static int __init mark_nonram_nosave(void)
  * everything else. GFP_DMA32 page allocations automatically fall back to
  * ZONE_DMA.
  *
- * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
+ * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
  * generic DMA mapping code.  32-bit only devices (if not handled by an IOMMU
  * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
  * ZONE_DMA.
@@ -230,6 +230,7 @@  void __init paging_init(void)
 {
 	unsigned long long total_ram = memblock_phys_mem_size();
 	phys_addr_t top_of_ram = memblock_end_of_DRAM();
+	int zone_dma_bits;
 
 #ifdef CONFIG_HIGHMEM
 	unsigned long v = __fix_to_virt(FIX_KMAP_END);
@@ -256,6 +257,8 @@  void __init paging_init(void)
 	else
 		zone_dma_bits = 31;
 
+	zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
+
 #ifdef CONFIG_ZONE_DMA
 	max_zone_pfns[ZONE_DMA]	= min(max_low_pfn,
 				      1UL << (zone_dma_bits - PAGE_SHIFT));
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ddcd39ef4346..91fc2b91adfc 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -97,7 +97,7 @@  void __init paging_init(void)
 
 	vmem_map_init();
 	sparse_init();
-	zone_dma_bits = 31;
+	zone_dma_limit = DMA_BIT_MASK(31);
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index edbe13d00776..d7e30d4f7503 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -12,7 +12,7 @@ 
 #include <linux/mem_encrypt.h>
 #include <linux/swiotlb.h>
 
-extern unsigned int zone_dma_bits;
+extern u64 zone_dma_limit;
 
 /*
  * Record the mapping of CPU physical to DMA addresses for a given region.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4480a3cd92e0..f2ba074a6a54 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -20,7 +20,7 @@ 
  * it for entirely different regions. In that case the arch code needs to
  * override the variable below for dma-direct to work properly.
  */
-unsigned int zone_dma_bits __ro_after_init = 24;
+u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
 
 static inline dma_addr_t phys_to_dma_direct(struct device *dev,
 		phys_addr_t phys)
@@ -59,7 +59,7 @@  static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
 	 * zones.
 	 */
 	*phys_limit = dma_to_phys(dev, dma_limit);
-	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+	if (*phys_limit <= zone_dma_limit)
 		return GFP_DMA;
 	if (*phys_limit <= DMA_BIT_MASK(32))
 		return GFP_DMA32;
@@ -580,7 +580,7 @@  int dma_direct_supported(struct device *dev, u64 mask)
 	 * part of the check.
 	 */
 	if (IS_ENABLED(CONFIG_ZONE_DMA))
-		min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
+		min_mask = min_t(u64, min_mask, zone_dma_limit);
 	return mask >= phys_to_dma_unencrypted(dev, min_mask);
 }
 
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d10613eb0f63..7b04f7575796 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -70,9 +70,9 @@  static bool cma_in_zone(gfp_t gfp)
 	/* CMA can't cross zone boundaries, see cma_activate_area() */
 	end = cma_get_base(cma) + size - 1;
 	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
-		return end <= DMA_BIT_MASK(zone_dma_bits);
+		return end <= zone_dma_limit;
 	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
-		return end <= DMA_BIT_MASK(32);
+		return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
 	return true;
 }
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index df68d29740a0..abcf3fa63a56 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -450,9 +450,9 @@  int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 	if (!remap)
 		io_tlb_default_mem.can_grow = true;
 	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
-		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+		io_tlb_default_mem.phys_limit = zone_dma_limit;
 	else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
-		io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
+		io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
 	else
 		io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
 #endif
@@ -629,7 +629,7 @@  static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
 	}
 
 	gfp &= ~GFP_ZONEMASK;
-	if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+	if (phys_limit <= zone_dma_limit)
 		gfp |= __GFP_DMA;
 	else if (phys_limit <= DMA_BIT_MASK(32))
 		gfp |= __GFP_DMA32;