Message ID | 17c067618b93e5d71f19c37826d54db4299621a3.1723359916.git.baruch@tkos.co.il (mailing list archive) |
---|---|
State | Handled Elsewhere |
Series | dma: support DMA zone starting above 4GB |
On Sun, 11 Aug 2024 10:09:35 +0300 Baruch Siach <baruch@tkos.co.il> wrote: > From: Catalin Marinas <catalin.marinas@arm.com> > > Hardware DMA limit might not be power of 2. When RAM range starts above > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit > can not encode this limit. > > Use plain address for DMA zone limit. > > Since DMA zone can now potentially span beyond 4GB physical limit of > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. > > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> > Co-developed-by: Baruch Siach <baruch@tkos.co.il> > Signed-off-by: Baruch Siach <baruch@tkos.co.il> LGTM. Reviewed-by: Petr Tesarik <ptesarik@suse.com> Petr T > --- > arch/arm64/mm/init.c | 30 +++++++++++++++--------------- > arch/powerpc/mm/mem.c | 5 ++++- > arch/s390/mm/init.c | 2 +- > include/linux/dma-direct.h | 2 +- > kernel/dma/direct.c | 6 +++--- > kernel/dma/pool.c | 4 ++-- > kernel/dma/swiotlb.c | 6 +++--- > 7 files changed, 29 insertions(+), 26 deletions(-) > > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > index 9b5ab6818f7f..c45e2152ca9e 100644 > --- a/arch/arm64/mm/init.c > +++ b/arch/arm64/mm/init.c > @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) > } > > /* > - * Return the maximum physical address for a zone accessible by the given bits > - * limit. If DRAM starts above 32-bit, expand the zone to the maximum > + * Return the maximum physical address for a zone given its limit. > + * If DRAM starts above 32-bit, expand the zone to the maximum > * available memory, otherwise cap it at 32-bit. > */ > -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) > +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) > { > - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); > phys_addr_t phys_start = memblock_start_of_DRAM(); > > if (phys_start > U32_MAX) > - zone_mask = PHYS_ADDR_MAX; > - else if (phys_start > zone_mask) > - zone_mask = U32_MAX; > + zone_limit = PHYS_ADDR_MAX; > + else if (phys_start > zone_limit) > + zone_limit = U32_MAX; > > - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; > + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; > } > > static void __init zone_sizes_init(void) > { > unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; > - unsigned int __maybe_unused acpi_zone_dma_bits; > - unsigned int __maybe_unused dt_zone_dma_bits; > - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); > + phys_addr_t __maybe_unused acpi_zone_dma_limit; > + phys_addr_t __maybe_unused dt_zone_dma_limit; > + phys_addr_t __maybe_unused dma32_phys_limit = > + max_zone_phys(DMA_BIT_MASK(32)); > > #ifdef CONFIG_ZONE_DMA > - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); > - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); > - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); > - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); > + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address(); > + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL); > + zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit); > + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); > max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); > #endif > #ifdef CONFIG_ZONE_DMA32 > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c > index d325217ab201..05b7f702b3f7 100644 > --- a/arch/powerpc/mm/mem.c > +++ b/arch/powerpc/mm/mem.c > @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void) > * everything else. 
GFP_DMA32 page allocations automatically fall back to > * ZONE_DMA. > * > - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the > + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the > * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU > * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by > * ZONE_DMA. > @@ -230,6 +230,7 @@ void __init paging_init(void) > { > unsigned long long total_ram = memblock_phys_mem_size(); > phys_addr_t top_of_ram = memblock_end_of_DRAM(); > + int zone_dma_bits; > > #ifdef CONFIG_HIGHMEM > unsigned long v = __fix_to_virt(FIX_KMAP_END); > @@ -256,6 +257,8 @@ void __init paging_init(void) > else > zone_dma_bits = 31; > > + zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); > + > #ifdef CONFIG_ZONE_DMA > max_zone_pfns[ZONE_DMA] = min(max_low_pfn, > 1UL << (zone_dma_bits - PAGE_SHIFT)); > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > index ddcd39ef4346..91fc2b91adfc 100644 > --- a/arch/s390/mm/init.c > +++ b/arch/s390/mm/init.c > @@ -97,7 +97,7 @@ void __init paging_init(void) > > vmem_map_init(); > sparse_init(); > - zone_dma_bits = 31; > + zone_dma_limit = DMA_BIT_MASK(31); > memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); > max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); > max_zone_pfns[ZONE_NORMAL] = max_low_pfn; > diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h > index edbe13d00776..d7e30d4f7503 100644 > --- a/include/linux/dma-direct.h > +++ b/include/linux/dma-direct.h > @@ -12,7 +12,7 @@ > #include <linux/mem_encrypt.h> > #include <linux/swiotlb.h> > > -extern unsigned int zone_dma_bits; > +extern u64 zone_dma_limit; > > /* > * Record the mapping of CPU physical to DMA addresses for a given region. > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c > index 4480a3cd92e0..f2ba074a6a54 100644 > --- a/kernel/dma/direct.c > +++ b/kernel/dma/direct.c > @@ -20,7 +20,7 @@ > * it for entirely different regions. In that case the arch code needs to > * override the variable below for dma-direct to work properly. > */ > -unsigned int zone_dma_bits __ro_after_init = 24; > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); > > static inline dma_addr_t phys_to_dma_direct(struct device *dev, > phys_addr_t phys) > @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) > * zones. > */ > *phys_limit = dma_to_phys(dev, dma_limit); > - if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) > + if (*phys_limit <= zone_dma_limit) > return GFP_DMA; > if (*phys_limit <= DMA_BIT_MASK(32)) > return GFP_DMA32; > @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask) > * part of the check. 
> */ > if (IS_ENABLED(CONFIG_ZONE_DMA)) > - min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); > + min_mask = min_t(u64, min_mask, zone_dma_limit); > return mask >= phys_to_dma_unencrypted(dev, min_mask); > } > > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c > index d10613eb0f63..7b04f7575796 100644 > --- a/kernel/dma/pool.c > +++ b/kernel/dma/pool.c > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) > /* CMA can't cross zone boundaries, see cma_activate_area() */ > end = cma_get_base(cma) + size - 1; > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) > - return end <= DMA_BIT_MASK(zone_dma_bits); > + return end <= zone_dma_limit; > if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) > - return end <= DMA_BIT_MASK(32); > + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); > return true; > } > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c > index df68d29740a0..abcf3fa63a56 100644 > --- a/kernel/dma/swiotlb.c > +++ b/kernel/dma/swiotlb.c > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, > if (!remap) > io_tlb_default_mem.can_grow = true; > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); > + io_tlb_default_mem.phys_limit = zone_dma_limit; > else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); > + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); > else > io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); > #endif > @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, > } > > gfp &= ~GFP_ZONEMASK; > - if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) > + if (phys_limit <= zone_dma_limit) > gfp |= __GFP_DMA; > else if (phys_limit <= DMA_BIT_MASK(32)) > gfp |= __GFP_DMA32;
On Sun, Aug 11, 2024 at 10:09:35AM +0300, Baruch Siach wrote: > From: Catalin Marinas <catalin.marinas@arm.com> > > Hardware DMA limit might not be power of 2. When RAM range starts above > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit > can not encode this limit. > > Use plain address for DMA zone limit. > > Since DMA zone can now potentially span beyond 4GB physical limit of > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. > > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> > Co-developed-by: Baruch Siach <baruch@tkos.co.il> > Signed-off-by: Baruch Siach <baruch@tkos.co.il> You might want to say that no functional change is expected with this patch. The patch looks fine. Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
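To make the limitation described in the commit message concrete, here is a small standalone C sketch (a userspace illustration, not kernel code) using the example values from the commit message: RAM starting at 4GB and a device that can address 30 bits above that base. No whole number of mask bits encodes the resulting 5GB limit, which is why a plain address is needed.

```c
#include <stdio.h>
#include <stdint.h>

/* Same definition as the kernel's DMA_BIT_MASK() */
#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

int main(void)
{
	uint64_t ram_start = 0x100000000ULL;	/* RAM starts at 4GB */
	unsigned int dev_bits = 30;		/* device reaches 1GB above the RAM base */

	/* Highest CPU physical address the device can reach: 5GB - 1 */
	uint64_t zone_dma_limit = ram_start + (1ULL << dev_bits) - 1;

	/* No single bit count produces that limit: */
	printf("DMA_BIT_MASK(32) = %#llx (4GB - 1)\n", (unsigned long long)DMA_BIT_MASK(32));
	printf("DMA_BIT_MASK(33) = %#llx (8GB - 1)\n", (unsigned long long)DMA_BIT_MASK(33));
	printf("needed limit     = %#llx (5GB - 1)\n", (unsigned long long)zone_dma_limit);
	return 0;
}
```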
On Sun, Aug 11, 2024 at 10:09:35AM +0300, Baruch Siach wrote: > From: Catalin Marinas <catalin.marinas@arm.com> > > Hardware DMA limit might not be power of 2. When RAM range starts above > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit > can not encode this limit. > > Use plain address for DMA zone limit. > > Since DMA zone can now potentially span beyond 4GB physical limit of > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. > > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> > Co-developed-by: Baruch Siach <baruch@tkos.co.il> > Signed-off-by: Baruch Siach <baruch@tkos.co.il> > --- > arch/arm64/mm/init.c | 30 +++++++++++++++--------------- > arch/powerpc/mm/mem.c | 5 ++++- > arch/s390/mm/init.c | 2 +- > include/linux/dma-direct.h | 2 +- > kernel/dma/direct.c | 6 +++--- > kernel/dma/pool.c | 4 ++-- > kernel/dma/swiotlb.c | 6 +++--- > 7 files changed, 29 insertions(+), 26 deletions(-) > > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > index 9b5ab6818f7f..c45e2152ca9e 100644 > --- a/arch/arm64/mm/init.c > +++ b/arch/arm64/mm/init.c > @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) > } > > /* > - * Return the maximum physical address for a zone accessible by the given bits > - * limit. If DRAM starts above 32-bit, expand the zone to the maximum > + * Return the maximum physical address for a zone given its limit. > + * If DRAM starts above 32-bit, expand the zone to the maximum > * available memory, otherwise cap it at 32-bit. > */ > -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) > +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) > { > - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); > phys_addr_t phys_start = memblock_start_of_DRAM(); > > if (phys_start > U32_MAX) > - zone_mask = PHYS_ADDR_MAX; > - else if (phys_start > zone_mask) > - zone_mask = U32_MAX; > + zone_limit = PHYS_ADDR_MAX; > + else if (phys_start > zone_limit) > + zone_limit = U32_MAX; > > - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; > + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; Why do we need to adjust +-1 now that we're no longer using a mask? > } > > static void __init zone_sizes_init(void) > { > unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; > - unsigned int __maybe_unused acpi_zone_dma_bits; > - unsigned int __maybe_unused dt_zone_dma_bits; > - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); > + phys_addr_t __maybe_unused acpi_zone_dma_limit; > + phys_addr_t __maybe_unused dt_zone_dma_limit; > + phys_addr_t __maybe_unused dma32_phys_limit = > + max_zone_phys(DMA_BIT_MASK(32)); > > #ifdef CONFIG_ZONE_DMA > - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); > - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); > - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); > - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); > + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address(); > + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL); > + zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit); > + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); > max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); > #endif Maybe move this block into a helper function so we can avoid three __maybe_unused variables? Will
On Fri, 16 Aug 2024 12:52:47 +0100 Will Deacon <will@kernel.org> wrote: > On Sun, Aug 11, 2024 at 10:09:35AM +0300, Baruch Siach wrote: > > From: Catalin Marinas <catalin.marinas@arm.com> > > > > Hardware DMA limit might not be power of 2. When RAM range starts above > > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit > > can not encode this limit. > > > > Use plain address for DMA zone limit. > > > > Since DMA zone can now potentially span beyond 4GB physical limit of > > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. > > > > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> > > Co-developed-by: Baruch Siach <baruch@tkos.co.il> > > Signed-off-by: Baruch Siach <baruch@tkos.co.il> > > --- > > arch/arm64/mm/init.c | 30 +++++++++++++++--------------- > > arch/powerpc/mm/mem.c | 5 ++++- > > arch/s390/mm/init.c | 2 +- > > include/linux/dma-direct.h | 2 +- > > kernel/dma/direct.c | 6 +++--- > > kernel/dma/pool.c | 4 ++-- > > kernel/dma/swiotlb.c | 6 +++--- > > 7 files changed, 29 insertions(+), 26 deletions(-) > > > > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > > index 9b5ab6818f7f..c45e2152ca9e 100644 > > --- a/arch/arm64/mm/init.c > > +++ b/arch/arm64/mm/init.c > > @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) > > } > > > > /* > > - * Return the maximum physical address for a zone accessible by the given bits > > - * limit. If DRAM starts above 32-bit, expand the zone to the maximum > > + * Return the maximum physical address for a zone given its limit. > > + * If DRAM starts above 32-bit, expand the zone to the maximum > > * available memory, otherwise cap it at 32-bit. > > */ > > -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) > > +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) > > { > > - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); > > phys_addr_t phys_start = memblock_start_of_DRAM(); > > > > if (phys_start > U32_MAX) > > - zone_mask = PHYS_ADDR_MAX; > > - else if (phys_start > zone_mask) > > - zone_mask = U32_MAX; > > + zone_limit = PHYS_ADDR_MAX; > > + else if (phys_start > zone_limit) > > + zone_limit = U32_MAX; > > > > - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; > > + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; > > Why do we need to adjust +-1 now that we're no longer using a mask? Subtracting 1 is needed to get the highest valid DRAM address so it can be compared to the highest address in the zone (zone_limit). Adding 1 is necessary to get the lowest address beyond the zone. AFAICT this is the right thing here: arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); max_zone_pfns[] max values are exclusive, i.e. the lowest PFN which is _not_ within the zone. It is also the right thing when arm64_dma_phys_limit is passed to dma_contiguous_reserve(). It would be subtly broken if phys_addr_t could be a 32-bit integer, but that's not possible on arm64. In short, LGTM. Petr T
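Petr's explanation of the +/-1 can be checked with a small standalone sketch (illustrative only, with assumed addresses): zone_limit and memblock_end_of_DRAM() - 1 are both inclusive "highest address" values, so the min is taken over inclusive addresses and the trailing + 1 converts the result back to the exclusive end that max_zone_pfns[] and dma_contiguous_reserve() expect.

```c
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	/* Assumed layout: 1GB of DRAM spanning 0x200000..0x40200000 */
	uint64_t dram_end   = 0x40200000ULL;	/* exclusive, like memblock_end_of_DRAM() */
	uint64_t zone_limit = 0xffffffffULL;	/* inclusive, like DMA_BIT_MASK(32) */

	/*
	 * Compare two inclusive "highest address" values, then add 1 to
	 * return an exclusive end, mirroring max_zone_phys():
	 *   min(zone_limit, memblock_end_of_DRAM() - 1) + 1
	 */
	uint64_t zone_end = MIN(zone_limit, dram_end - 1) + 1;

	printf("zone ends at %#llx (exclusive)\n", (unsigned long long)zone_end);
	return 0;
}
```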
Dear All, On 11.08.2024 09:09, Baruch Siach wrote: > From: Catalin Marinas <catalin.marinas@arm.com> > > Hardware DMA limit might not be power of 2. When RAM range starts above > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit > can not encode this limit. > > Use plain address for DMA zone limit. > > Since DMA zone can now potentially span beyond 4GB physical limit of > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. > > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> > Co-developed-by: Baruch Siach <baruch@tkos.co.il> > Signed-off-by: Baruch Siach <baruch@tkos.co.il> > --- This patch landed recently in linux-next as commit ba0fb44aed47 ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my tests I found that it introduces the following warning on ARM64/Rockchip based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts): ------------[ cut here ]------------ dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 overflow (mask ffffffff, bus limit 0). WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308 Modules linked in: CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278 Hardware name: Hardkernel ODROID-M1 (DT) pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : swiotlb_map+0x2f0/0x308 lr : swiotlb_map+0x2f0/0x308 ... Call trace: swiotlb_map+0x2f0/0x308 dma_direct_map_sg+0x9c/0x2e4 __dma_map_sg_attrs+0x28/0x94 dma_map_sg_attrs+0x10/0x24 dw_mci_pre_dma_transfer+0xb8/0xf4 dw_mci_pre_req+0x50/0x68 mmc_blk_mq_issue_rq+0x3e0/0x964 mmc_mq_queue_rq+0x118/0x2b4 blk_mq_dispatch_rq_list+0x21c/0x714 __blk_mq_sched_dispatch_requests+0x490/0x58c blk_mq_sched_dispatch_requests+0x30/0x6c blk_mq_run_hw_queue+0x284/0x40c blk_mq_flush_plug_list.part.0+0x190/0x974 blk_mq_flush_plug_list+0x1c/0x2c __blk_flush_plug+0xe4/0x140 blk_finish_plug+0x38/0x4c __ext4_get_inode_loc+0x22c/0x654 __ext4_get_inode_loc_noinmem+0x40/0xa8 __ext4_iget+0x154/0xcc0 ext4_get_journal_inode+0x30/0x110 ext4_load_and_init_journal+0x9c/0xaf0 ext4_fill_super+0x1fec/0x2d90 get_tree_bdev+0x140/0x1d8 ext4_get_tree+0x18/0x24 vfs_get_tree+0x28/0xe8 path_mount+0x3e8/0xb7c init_mount+0x68/0xac do_mount_root+0x108/0x1dc mount_root_generic+0x100/0x330 mount_root+0x160/0x2d0 initrd_load+0x1f0/0x2a0 prepare_namespace+0x4c/0x29c kernel_init_freeable+0x4b4/0x50c kernel_init+0x20/0x1d8 ret_from_fork+0x10/0x20 irq event stamp: 1305682 hardirqs last enabled at (1305681): [<ffff8000800e332c>] console_unlock+0x124/0x130 hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c softirqs last enabled at (1305678): [<ffff80008005be1c>] handle_softirqs+0x4cc/0x4e4 softirqs last disabled at (1305665): [<ffff8000800105b0>] __do_softirq+0x14/0x20 ---[ end trace 0000000000000000 ]--- This "bus limit 0" seems to be a bit suspicious to me as well as the fact that swiotlb is used for the MMC DMA. I will investigate this further tomorrow. The board boots fine though. 
> arch/arm64/mm/init.c | 30 +++++++++++++++--------------- > arch/powerpc/mm/mem.c | 5 ++++- > arch/s390/mm/init.c | 2 +- > include/linux/dma-direct.h | 2 +- > kernel/dma/direct.c | 6 +++--- > kernel/dma/pool.c | 4 ++-- > kernel/dma/swiotlb.c | 6 +++--- > 7 files changed, 29 insertions(+), 26 deletions(-) > > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > index 9b5ab6818f7f..c45e2152ca9e 100644 > --- a/arch/arm64/mm/init.c > +++ b/arch/arm64/mm/init.c > @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) > } > > /* > - * Return the maximum physical address for a zone accessible by the given bits > - * limit. If DRAM starts above 32-bit, expand the zone to the maximum > + * Return the maximum physical address for a zone given its limit. > + * If DRAM starts above 32-bit, expand the zone to the maximum > * available memory, otherwise cap it at 32-bit. > */ > -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) > +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) > { > - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); > phys_addr_t phys_start = memblock_start_of_DRAM(); > > if (phys_start > U32_MAX) > - zone_mask = PHYS_ADDR_MAX; > - else if (phys_start > zone_mask) > - zone_mask = U32_MAX; > + zone_limit = PHYS_ADDR_MAX; > + else if (phys_start > zone_limit) > + zone_limit = U32_MAX; > > - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; > + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; > } > > static void __init zone_sizes_init(void) > { > unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; > - unsigned int __maybe_unused acpi_zone_dma_bits; > - unsigned int __maybe_unused dt_zone_dma_bits; > - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); > + phys_addr_t __maybe_unused acpi_zone_dma_limit; > + phys_addr_t __maybe_unused dt_zone_dma_limit; > + phys_addr_t __maybe_unused dma32_phys_limit = > + max_zone_phys(DMA_BIT_MASK(32)); > > #ifdef CONFIG_ZONE_DMA > - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); > - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); > - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); > - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); > + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address(); > + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL); > + zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit); > + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); > max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); > #endif > #ifdef CONFIG_ZONE_DMA32 > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c > index d325217ab201..05b7f702b3f7 100644 > --- a/arch/powerpc/mm/mem.c > +++ b/arch/powerpc/mm/mem.c > @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void) > * everything else. GFP_DMA32 page allocations automatically fall back to > * ZONE_DMA. > * > - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the > + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the > * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU > * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by > * ZONE_DMA. 
> @@ -230,6 +230,7 @@ void __init paging_init(void) > { > unsigned long long total_ram = memblock_phys_mem_size(); > phys_addr_t top_of_ram = memblock_end_of_DRAM(); > + int zone_dma_bits; > > #ifdef CONFIG_HIGHMEM > unsigned long v = __fix_to_virt(FIX_KMAP_END); > @@ -256,6 +257,8 @@ void __init paging_init(void) > else > zone_dma_bits = 31; > > + zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); > + > #ifdef CONFIG_ZONE_DMA > max_zone_pfns[ZONE_DMA] = min(max_low_pfn, > 1UL << (zone_dma_bits - PAGE_SHIFT)); > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > index ddcd39ef4346..91fc2b91adfc 100644 > --- a/arch/s390/mm/init.c > +++ b/arch/s390/mm/init.c > @@ -97,7 +97,7 @@ void __init paging_init(void) > > vmem_map_init(); > sparse_init(); > - zone_dma_bits = 31; > + zone_dma_limit = DMA_BIT_MASK(31); > memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); > max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); > max_zone_pfns[ZONE_NORMAL] = max_low_pfn; > diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h > index edbe13d00776..d7e30d4f7503 100644 > --- a/include/linux/dma-direct.h > +++ b/include/linux/dma-direct.h > @@ -12,7 +12,7 @@ > #include <linux/mem_encrypt.h> > #include <linux/swiotlb.h> > > -extern unsigned int zone_dma_bits; > +extern u64 zone_dma_limit; > > /* > * Record the mapping of CPU physical to DMA addresses for a given region. > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c > index 4480a3cd92e0..f2ba074a6a54 100644 > --- a/kernel/dma/direct.c > +++ b/kernel/dma/direct.c > @@ -20,7 +20,7 @@ > * it for entirely different regions. In that case the arch code needs to > * override the variable below for dma-direct to work properly. > */ > -unsigned int zone_dma_bits __ro_after_init = 24; > +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); > > static inline dma_addr_t phys_to_dma_direct(struct device *dev, > phys_addr_t phys) > @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) > * zones. > */ > *phys_limit = dma_to_phys(dev, dma_limit); > - if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) > + if (*phys_limit <= zone_dma_limit) > return GFP_DMA; > if (*phys_limit <= DMA_BIT_MASK(32)) > return GFP_DMA32; > @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask) > * part of the check. 
> */ > if (IS_ENABLED(CONFIG_ZONE_DMA)) > - min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); > + min_mask = min_t(u64, min_mask, zone_dma_limit); > return mask >= phys_to_dma_unencrypted(dev, min_mask); > } > > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c > index d10613eb0f63..7b04f7575796 100644 > --- a/kernel/dma/pool.c > +++ b/kernel/dma/pool.c > @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) > /* CMA can't cross zone boundaries, see cma_activate_area() */ > end = cma_get_base(cma) + size - 1; > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) > - return end <= DMA_BIT_MASK(zone_dma_bits); > + return end <= zone_dma_limit; > if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) > - return end <= DMA_BIT_MASK(32); > + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); > return true; > } > > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c > index df68d29740a0..abcf3fa63a56 100644 > --- a/kernel/dma/swiotlb.c > +++ b/kernel/dma/swiotlb.c > @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, > if (!remap) > io_tlb_default_mem.can_grow = true; > if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); > + io_tlb_default_mem.phys_limit = zone_dma_limit; > else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) > - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); > + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); > else > io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); > #endif > @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, > } > > gfp &= ~GFP_ZONEMASK; > - if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) > + if (phys_limit <= zone_dma_limit) > gfp |= __GFP_DMA; > else if (phys_limit <= DMA_BIT_MASK(32)) > gfp |= __GFP_DMA32; Best regards
Hi Marek, Thanks for your report. On Mon, Aug 26 2024, Marek Szyprowski wrote: > On 11.08.2024 09:09, Baruch Siach wrote: >> From: Catalin Marinas <catalin.marinas@arm.com> >> >> Hardware DMA limit might not be power of 2. When RAM range starts above >> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit >> can not encode this limit. >> >> Use plain address for DMA zone limit. >> >> Since DMA zone can now potentially span beyond 4GB physical limit of >> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. >> >> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> >> Co-developed-by: Baruch Siach <baruch@tkos.co.il> >> Signed-off-by: Baruch Siach <baruch@tkos.co.il> >> --- > > This patch landed recently in linux-next as commit ba0fb44aed47 > ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my > tests I found that it introduces the following warning on ARM64/Rockchip > based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts): Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47? Upstream rockchip DTs have no dma-ranges property. Is that the case for your platform as well? Can you share kernel report of DMA zones and swiotlb? On my platform I get: [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000800000000-0x000000083fffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000840000000-0x0000000fffffffff] ... [ 0.000000] software IO TLB: area num 8. [ 0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB) What do you get at your end? > ------------[ cut here ]------------ > dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 > overflow (mask ffffffff, bus limit 0). > WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308 > Modules linked in: > CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278 > Hardware name: Hardkernel ODROID-M1 (DT) > pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) > pc : swiotlb_map+0x2f0/0x308 > lr : swiotlb_map+0x2f0/0x308 > ... 
> Call trace: > swiotlb_map+0x2f0/0x308 > dma_direct_map_sg+0x9c/0x2e4 > __dma_map_sg_attrs+0x28/0x94 > dma_map_sg_attrs+0x10/0x24 > dw_mci_pre_dma_transfer+0xb8/0xf4 > dw_mci_pre_req+0x50/0x68 > mmc_blk_mq_issue_rq+0x3e0/0x964 > mmc_mq_queue_rq+0x118/0x2b4 > blk_mq_dispatch_rq_list+0x21c/0x714 > __blk_mq_sched_dispatch_requests+0x490/0x58c > blk_mq_sched_dispatch_requests+0x30/0x6c > blk_mq_run_hw_queue+0x284/0x40c > blk_mq_flush_plug_list.part.0+0x190/0x974 > blk_mq_flush_plug_list+0x1c/0x2c > __blk_flush_plug+0xe4/0x140 > blk_finish_plug+0x38/0x4c > __ext4_get_inode_loc+0x22c/0x654 > __ext4_get_inode_loc_noinmem+0x40/0xa8 > __ext4_iget+0x154/0xcc0 > ext4_get_journal_inode+0x30/0x110 > ext4_load_and_init_journal+0x9c/0xaf0 > ext4_fill_super+0x1fec/0x2d90 > get_tree_bdev+0x140/0x1d8 > ext4_get_tree+0x18/0x24 > vfs_get_tree+0x28/0xe8 > path_mount+0x3e8/0xb7c > init_mount+0x68/0xac > do_mount_root+0x108/0x1dc > mount_root_generic+0x100/0x330 > mount_root+0x160/0x2d0 > initrd_load+0x1f0/0x2a0 > prepare_namespace+0x4c/0x29c > kernel_init_freeable+0x4b4/0x50c > kernel_init+0x20/0x1d8 > ret_from_fork+0x10/0x20 > irq event stamp: 1305682 > hardirqs last enabled at (1305681): [<ffff8000800e332c>] > console_unlock+0x124/0x130 > hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c > softirqs last enabled at (1305678): [<ffff80008005be1c>] > handle_softirqs+0x4cc/0x4e4 > softirqs last disabled at (1305665): [<ffff8000800105b0>] > __do_softirq+0x14/0x20 > ---[ end trace 0000000000000000 ]--- > > This "bus limit 0" seems to be a bit suspicious to me as well as the > fact that swiotlb is used for the MMC DMA. I will investigate this > further tomorrow. The board boots fine though. Looking at the code I guess that bus_dma_limit set to 0 means no bus limit. But dma_mask for your device indicates 32-bit device limit. This can't work with address above 4GB. For some reason DMA code tries to allocate from higher address. This is most likely the reason dma_capable() returns false. Thanks, baruch >> arch/arm64/mm/init.c | 30 +++++++++++++++--------------- >> arch/powerpc/mm/mem.c | 5 ++++- >> arch/s390/mm/init.c | 2 +- >> include/linux/dma-direct.h | 2 +- >> kernel/dma/direct.c | 6 +++--- >> kernel/dma/pool.c | 4 ++-- >> kernel/dma/swiotlb.c | 6 +++--- >> 7 files changed, 29 insertions(+), 26 deletions(-) >> >> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c >> index 9b5ab6818f7f..c45e2152ca9e 100644 >> --- a/arch/arm64/mm/init.c >> +++ b/arch/arm64/mm/init.c >> @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void) >> } >> >> /* >> - * Return the maximum physical address for a zone accessible by the given bits >> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum >> + * Return the maximum physical address for a zone given its limit. >> + * If DRAM starts above 32-bit, expand the zone to the maximum >> * available memory, otherwise cap it at 32-bit. 
>> */ >> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits) >> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) >> { >> - phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); >> phys_addr_t phys_start = memblock_start_of_DRAM(); >> >> if (phys_start > U32_MAX) >> - zone_mask = PHYS_ADDR_MAX; >> - else if (phys_start > zone_mask) >> - zone_mask = U32_MAX; >> + zone_limit = PHYS_ADDR_MAX; >> + else if (phys_start > zone_limit) >> + zone_limit = U32_MAX; >> >> - return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; >> + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; >> } >> >> static void __init zone_sizes_init(void) >> { >> unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; >> - unsigned int __maybe_unused acpi_zone_dma_bits; >> - unsigned int __maybe_unused dt_zone_dma_bits; >> - phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); >> + phys_addr_t __maybe_unused acpi_zone_dma_limit; >> + phys_addr_t __maybe_unused dt_zone_dma_limit; >> + phys_addr_t __maybe_unused dma32_phys_limit = >> + max_zone_phys(DMA_BIT_MASK(32)); >> >> #ifdef CONFIG_ZONE_DMA >> - acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); >> - dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); >> - zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); >> - arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); >> + acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address(); >> + dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL); >> + zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit); >> + arm64_dma_phys_limit = max_zone_phys(zone_dma_limit); >> max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); >> #endif >> #ifdef CONFIG_ZONE_DMA32 >> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c >> index d325217ab201..05b7f702b3f7 100644 >> --- a/arch/powerpc/mm/mem.c >> +++ b/arch/powerpc/mm/mem.c >> @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void) >> * everything else. GFP_DMA32 page allocations automatically fall back to >> * ZONE_DMA. >> * >> - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the >> + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the >> * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU >> * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by >> * ZONE_DMA. 
>> @@ -230,6 +230,7 @@ void __init paging_init(void) >> { >> unsigned long long total_ram = memblock_phys_mem_size(); >> phys_addr_t top_of_ram = memblock_end_of_DRAM(); >> + int zone_dma_bits; >> >> #ifdef CONFIG_HIGHMEM >> unsigned long v = __fix_to_virt(FIX_KMAP_END); >> @@ -256,6 +257,8 @@ void __init paging_init(void) >> else >> zone_dma_bits = 31; >> >> + zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); >> + >> #ifdef CONFIG_ZONE_DMA >> max_zone_pfns[ZONE_DMA] = min(max_low_pfn, >> 1UL << (zone_dma_bits - PAGE_SHIFT)); >> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c >> index ddcd39ef4346..91fc2b91adfc 100644 >> --- a/arch/s390/mm/init.c >> +++ b/arch/s390/mm/init.c >> @@ -97,7 +97,7 @@ void __init paging_init(void) >> >> vmem_map_init(); >> sparse_init(); >> - zone_dma_bits = 31; >> + zone_dma_limit = DMA_BIT_MASK(31); >> memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); >> max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); >> max_zone_pfns[ZONE_NORMAL] = max_low_pfn; >> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h >> index edbe13d00776..d7e30d4f7503 100644 >> --- a/include/linux/dma-direct.h >> +++ b/include/linux/dma-direct.h >> @@ -12,7 +12,7 @@ >> #include <linux/mem_encrypt.h> >> #include <linux/swiotlb.h> >> >> -extern unsigned int zone_dma_bits; >> +extern u64 zone_dma_limit; >> >> /* >> * Record the mapping of CPU physical to DMA addresses for a given region. >> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c >> index 4480a3cd92e0..f2ba074a6a54 100644 >> --- a/kernel/dma/direct.c >> +++ b/kernel/dma/direct.c >> @@ -20,7 +20,7 @@ >> * it for entirely different regions. In that case the arch code needs to >> * override the variable below for dma-direct to work properly. >> */ >> -unsigned int zone_dma_bits __ro_after_init = 24; >> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); >> >> static inline dma_addr_t phys_to_dma_direct(struct device *dev, >> phys_addr_t phys) >> @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) >> * zones. >> */ >> *phys_limit = dma_to_phys(dev, dma_limit); >> - if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) >> + if (*phys_limit <= zone_dma_limit) >> return GFP_DMA; >> if (*phys_limit <= DMA_BIT_MASK(32)) >> return GFP_DMA32; >> @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask) >> * part of the check. 
>> */ >> if (IS_ENABLED(CONFIG_ZONE_DMA)) >> - min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); >> + min_mask = min_t(u64, min_mask, zone_dma_limit); >> return mask >= phys_to_dma_unencrypted(dev, min_mask); >> } >> >> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c >> index d10613eb0f63..7b04f7575796 100644 >> --- a/kernel/dma/pool.c >> +++ b/kernel/dma/pool.c >> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) >> /* CMA can't cross zone boundaries, see cma_activate_area() */ >> end = cma_get_base(cma) + size - 1; >> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) >> - return end <= DMA_BIT_MASK(zone_dma_bits); >> + return end <= zone_dma_limit; >> if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) >> - return end <= DMA_BIT_MASK(32); >> + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); >> return true; >> } >> >> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c >> index df68d29740a0..abcf3fa63a56 100644 >> --- a/kernel/dma/swiotlb.c >> +++ b/kernel/dma/swiotlb.c >> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, >> if (!remap) >> io_tlb_default_mem.can_grow = true; >> if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) >> - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); >> + io_tlb_default_mem.phys_limit = zone_dma_limit; >> else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) >> - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); >> + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); >> else >> io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); >> #endif >> @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, >> } >> >> gfp &= ~GFP_ZONEMASK; >> - if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) >> + if (phys_limit <= zone_dma_limit) >> gfp |= __GFP_DMA; >> else if (phys_limit <= DMA_BIT_MASK(32)) >> gfp |= __GFP_DMA32; > > Best regards
On 27.08.2024 06:52, Baruch Siach wrote: > Hi Marek, > > Thanks for your report. > > On Mon, Aug 26 2024, Marek Szyprowski wrote: >> On 11.08.2024 09:09, Baruch Siach wrote: >>> From: Catalin Marinas <catalin.marinas@arm.com> >>> >>> Hardware DMA limit might not be power of 2. When RAM range starts above >>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit >>> can not encode this limit. >>> >>> Use plain address for DMA zone limit. >>> >>> Since DMA zone can now potentially span beyond 4GB physical limit of >>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. >>> >>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> >>> Co-developed-by: Baruch Siach <baruch@tkos.co.il> >>> Signed-off-by: Baruch Siach <baruch@tkos.co.il> >>> --- >> This patch landed recently in linux-next as commit ba0fb44aed47 >> ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my >> tests I found that it introduces the following warning on ARM64/Rockchip >> based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts): > Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47? Yes, linux-next with above mentioned commits reverted works fine. > Upstream rockchip DTs have no dma-ranges property. Is that the case for > your platform as well? > > Can you share kernel report of DMA zones and swiotlb? On my platform I get: > > [ 0.000000] Zone ranges: > [ 0.000000] DMA [mem 0x0000000800000000-0x000000083fffffff] > [ 0.000000] DMA32 empty > [ 0.000000] Normal [mem 0x0000000840000000-0x0000000fffffffff] > ... > [ 0.000000] software IO TLB: area num 8. > [ 0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB) > > What do you get at your end? On ba0fb44aed47 I got: [ 0.000000] NUMA: No NUMA configuration found [ 0.000000] NUMA: Faking a node at [mem 0x0000000000200000-0x00000001ffffffff] [ 0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff] [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000000200000-0x00000001ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal empty [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000000200000-0x00000000083fffff] [ 0.000000] node 0: [mem 0x0000000009400000-0x00000000efffffff] [ 0.000000] node 0: [mem 0x00000001f0000000-0x00000001ffffffff] [ 0.000000] Initmem setup node 0 [mem 0x0000000000200000-0x00000001ffffffff] [ 0.000000] On node 0, zone DMA: 512 pages in unavailable ranges [ 0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges [ 0.000000] cma: Reserved 96 MiB at 0x00000001f0000000 on node -1 ... [ 0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 3MB [ 0.000000] software IO TLB: area num 4. 
[ 0.000000] software IO TLB: mapped [mem 0x00000001fac00000-0x00000001fb000000] (4MB) On the fa3c109a6d30 (parent commit of the $subject) I got: [ 0.000000] NUMA: No NUMA configuration found [ 0.000000] NUMA: Faking a node at [mem 0x0000000000200000-0x00000001ffffffff] [ 0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff] [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000000200000-0x00000000ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000100000000-0x00000001ffffffff] [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000000200000-0x00000000083fffff] [ 0.000000] node 0: [mem 0x0000000009400000-0x00000000efffffff] [ 0.000000] node 0: [mem 0x00000001f0000000-0x00000001ffffffff] [ 0.000000] Initmem setup node 0 [mem 0x0000000000200000-0x00000001ffffffff] [ 0.000000] On node 0, zone DMA: 512 pages in unavailable ranges [ 0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges [ 0.000000] cma: Reserved 96 MiB at 0x00000000ea000000 on node -1 ... [ 0.000000] software IO TLB: area num 4. [ 0.000000] software IO TLB: mapped [mem 0x00000000e6000000-0x00000000ea000000] (64MB) It looks that for some reasons $subject patch changes the default zone and swiotlb configuration. >> ------------[ cut here ]------------ >> dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 >> overflow (mask ffffffff, bus limit 0). >> WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308 >> Modules linked in: >> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278 >> Hardware name: Hardkernel ODROID-M1 (DT) >> pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) >> pc : swiotlb_map+0x2f0/0x308 >> lr : swiotlb_map+0x2f0/0x308 >> ... >> Call trace: >> swiotlb_map+0x2f0/0x308 >> dma_direct_map_sg+0x9c/0x2e4 >> __dma_map_sg_attrs+0x28/0x94 >> dma_map_sg_attrs+0x10/0x24 >> dw_mci_pre_dma_transfer+0xb8/0xf4 >> dw_mci_pre_req+0x50/0x68 >> mmc_blk_mq_issue_rq+0x3e0/0x964 >> mmc_mq_queue_rq+0x118/0x2b4 >> blk_mq_dispatch_rq_list+0x21c/0x714 >> __blk_mq_sched_dispatch_requests+0x490/0x58c >> blk_mq_sched_dispatch_requests+0x30/0x6c >> blk_mq_run_hw_queue+0x284/0x40c >> blk_mq_flush_plug_list.part.0+0x190/0x974 >> blk_mq_flush_plug_list+0x1c/0x2c >> __blk_flush_plug+0xe4/0x140 >> blk_finish_plug+0x38/0x4c >> __ext4_get_inode_loc+0x22c/0x654 >> __ext4_get_inode_loc_noinmem+0x40/0xa8 >> __ext4_iget+0x154/0xcc0 >> ext4_get_journal_inode+0x30/0x110 >> ext4_load_and_init_journal+0x9c/0xaf0 >> ext4_fill_super+0x1fec/0x2d90 >> get_tree_bdev+0x140/0x1d8 >> ext4_get_tree+0x18/0x24 >> vfs_get_tree+0x28/0xe8 >> path_mount+0x3e8/0xb7c >> init_mount+0x68/0xac >> do_mount_root+0x108/0x1dc >> mount_root_generic+0x100/0x330 >> mount_root+0x160/0x2d0 >> initrd_load+0x1f0/0x2a0 >> prepare_namespace+0x4c/0x29c >> kernel_init_freeable+0x4b4/0x50c >> kernel_init+0x20/0x1d8 >> ret_from_fork+0x10/0x20 >> irq event stamp: 1305682 >> hardirqs last enabled at (1305681): [<ffff8000800e332c>] >> console_unlock+0x124/0x130 >> hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c >> softirqs last enabled at (1305678): [<ffff80008005be1c>] >> handle_softirqs+0x4cc/0x4e4 >> softirqs last disabled at (1305665): [<ffff8000800105b0>] >> __do_softirq+0x14/0x20 >> ---[ end trace 0000000000000000 ]--- >> >> This "bus limit 0" seems to be a bit suspicious to me as well as the >> fact that swiotlb is used for the MMC DMA. I will investigate this >> further tomorrow. The board boots fine though. 
> Looking at the code I guess that bus_dma_limit set to 0 means no bus > limit. But dma_mask for your device indicates 32-bit device limit. This > can't work with address above 4GB. For some reason DMA code tries to > allocate from higher address. This is most likely the reason > dma_capable() returns false. Indeed this looks like a source of the problem: [ 3.123618] Synopsys Designware Multimedia Card Interface Driver [ 3.139653] dwmmc_rockchip fe2b0000.mmc: IDMAC supports 32-bit address mode. [ 3.147739] dwmmc_rockchip fe2b0000.mmc: Using internal DMA controller. [ 3.161659] dwmmc_rockchip fe2b0000.mmc: Version ID is 270a [ 3.168455] dwmmc_rockchip fe2b0000.mmc: DW MMC controller at irq 56,32 bit host data width,256 deep fifo [ 3.182651] dwmmc_rockchip fe2b0000.mmc: Got CD GPIO ... [ 11.009258] ------------[ cut here ]------------ [ 11.014762] dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 overflow (mask ffffffff, bus limit 0). > ... Best regards
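A rough standalone model of the check that fires here (a sketch of the dma_capable() idea as discussed above, not the actual kernel implementation; treating bus_dma_limit == 0 as "no extra bus restriction" is an assumption based on Baruch's reading): with a 32-bit dma_mask and no bus limit, a bounce buffer mapped above 4GB fails the check and swiotlb_map() warns.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Simplified model of the dma_capable() check; the real kernel code differs in detail. */
static bool sketch_dma_capable(uint64_t dma_addr, uint64_t size,
			       uint64_t dma_mask, uint64_t bus_dma_limit)
{
	uint64_t end = dma_addr + size - 1;
	/* Assumption: bus_dma_limit of 0 means no extra bus restriction */
	uint64_t limit = bus_dma_limit ? MIN(dma_mask, bus_dma_limit) : dma_mask;

	return end <= limit;
}

int main(void)
{
	/* Values from the warning: swiotlb addr 0x00000001faf00000+4096, mask ffffffff, bus limit 0 */
	bool ok = sketch_dma_capable(0x1faf00000ULL, 4096, 0xffffffffULL, 0);

	printf("capable: %s\n", ok ? "yes" : "no");	/* prints "no", so swiotlb_map() warns */
	return 0;
}
```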
Hi Marek, On Tue, Aug 27 2024, Marek Szyprowski wrote: > On 27.08.2024 06:52, Baruch Siach wrote: >> Hi Marek, >> >> Thanks for your report. >> >> On Mon, Aug 26 2024, Marek Szyprowski wrote: >>> On 11.08.2024 09:09, Baruch Siach wrote: >>>> From: Catalin Marinas <catalin.marinas@arm.com> >>>> >>>> Hardware DMA limit might not be power of 2. When RAM range starts above >>>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit >>>> can not encode this limit. >>>> >>>> Use plain address for DMA zone limit. >>>> >>>> Since DMA zone can now potentially span beyond 4GB physical limit of >>>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. >>>> >>>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> >>>> Co-developed-by: Baruch Siach <baruch@tkos.co.il> >>>> Signed-off-by: Baruch Siach <baruch@tkos.co.il> >>>> --- >>> This patch landed recently in linux-next as commit ba0fb44aed47 >>> ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my >>> tests I found that it introduces the following warning on ARM64/Rockchip >>> based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts): >> Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47? > > Yes, linux-next with above mentioned commits reverted works fine. > > >> Upstream rockchip DTs have no dma-ranges property. Is that the case for >> your platform as well? >> >> Can you share kernel report of DMA zones and swiotlb? On my platform I get: >> >> [ 0.000000] Zone ranges: >> [ 0.000000] DMA [mem 0x0000000800000000-0x000000083fffffff] >> [ 0.000000] DMA32 empty >> [ 0.000000] Normal [mem 0x0000000840000000-0x0000000fffffffff] >> ... >> [ 0.000000] software IO TLB: area num 8. >> [ 0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB) >> >> What do you get at your end? > > On ba0fb44aed47 I got: > > [ 0.000000] NUMA: No NUMA configuration found > [ 0.000000] NUMA: Faking a node at [mem > 0x0000000000200000-0x00000001ffffffff] > [ 0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff] > [ 0.000000] Zone ranges: > [ 0.000000] DMA [mem 0x0000000000200000-0x00000001ffffffff] > [ 0.000000] DMA32 empty > [ 0.000000] Normal empty > [ 0.000000] Movable zone start for each node > [ 0.000000] Early memory node ranges > [ 0.000000] node 0: [mem 0x0000000000200000-0x00000000083fffff] > [ 0.000000] node 0: [mem 0x0000000009400000-0x00000000efffffff] > [ 0.000000] node 0: [mem 0x00000001f0000000-0x00000001ffffffff] > [ 0.000000] Initmem setup node 0 [mem > 0x0000000000200000-0x00000001ffffffff] > [ 0.000000] On node 0, zone DMA: 512 pages in unavailable ranges > [ 0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges > [ 0.000000] cma: Reserved 96 MiB at 0x00000001f0000000 on node -1 > > ... > > [ 0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 3MB > [ 0.000000] software IO TLB: area num 4. 
> [ 0.000000] software IO TLB: mapped [mem > 0x00000001fac00000-0x00000001fb000000] (4MB) > > On the fa3c109a6d30 (parent commit of the $subject) I got: > > [ 0.000000] NUMA: No NUMA configuration found > [ 0.000000] NUMA: Faking a node at [mem > 0x0000000000200000-0x00000001ffffffff] > [ 0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff] > [ 0.000000] Zone ranges: > [ 0.000000] DMA [mem 0x0000000000200000-0x00000000ffffffff] > [ 0.000000] DMA32 empty > [ 0.000000] Normal [mem 0x0000000100000000-0x00000001ffffffff] > [ 0.000000] Movable zone start for each node > [ 0.000000] Early memory node ranges > [ 0.000000] node 0: [mem 0x0000000000200000-0x00000000083fffff] > [ 0.000000] node 0: [mem 0x0000000009400000-0x00000000efffffff] > [ 0.000000] node 0: [mem 0x00000001f0000000-0x00000001ffffffff] > [ 0.000000] Initmem setup node 0 [mem > 0x0000000000200000-0x00000001ffffffff] > [ 0.000000] On node 0, zone DMA: 512 pages in unavailable ranges > [ 0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges > [ 0.000000] cma: Reserved 96 MiB at 0x00000000ea000000 on node -1 > > ... > > [ 0.000000] software IO TLB: area num 4. > [ 0.000000] software IO TLB: mapped [mem > 0x00000000e6000000-0x00000000ea000000] (64MB) > > It looks that for some reasons $subject patch changes the default zone > and swiotlb configuration. Does this fix the issue? diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index bfb10969cbf0..7fcd0aaa9bb6 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -116,6 +116,9 @@ static void __init arch_reserve_crashkernel(void) static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) { + if (memblock_start_of_DRAM() < U32_MAX) + zone_limit = min(zone_limit, U32_MAX); + return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; } Thanks, baruch >>> ------------[ cut here ]------------ >>> dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 >>> overflow (mask ffffffff, bus limit 0). >>> WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308 >>> Modules linked in: >>> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278 >>> Hardware name: Hardkernel ODROID-M1 (DT) >>> pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) >>> pc : swiotlb_map+0x2f0/0x308 >>> lr : swiotlb_map+0x2f0/0x308 >>> ... 
>>> Call trace: >>> swiotlb_map+0x2f0/0x308 >>> dma_direct_map_sg+0x9c/0x2e4 >>> __dma_map_sg_attrs+0x28/0x94 >>> dma_map_sg_attrs+0x10/0x24 >>> dw_mci_pre_dma_transfer+0xb8/0xf4 >>> dw_mci_pre_req+0x50/0x68 >>> mmc_blk_mq_issue_rq+0x3e0/0x964 >>> mmc_mq_queue_rq+0x118/0x2b4 >>> blk_mq_dispatch_rq_list+0x21c/0x714 >>> __blk_mq_sched_dispatch_requests+0x490/0x58c >>> blk_mq_sched_dispatch_requests+0x30/0x6c >>> blk_mq_run_hw_queue+0x284/0x40c >>> blk_mq_flush_plug_list.part.0+0x190/0x974 >>> blk_mq_flush_plug_list+0x1c/0x2c >>> __blk_flush_plug+0xe4/0x140 >>> blk_finish_plug+0x38/0x4c >>> __ext4_get_inode_loc+0x22c/0x654 >>> __ext4_get_inode_loc_noinmem+0x40/0xa8 >>> __ext4_iget+0x154/0xcc0 >>> ext4_get_journal_inode+0x30/0x110 >>> ext4_load_and_init_journal+0x9c/0xaf0 >>> ext4_fill_super+0x1fec/0x2d90 >>> get_tree_bdev+0x140/0x1d8 >>> ext4_get_tree+0x18/0x24 >>> vfs_get_tree+0x28/0xe8 >>> path_mount+0x3e8/0xb7c >>> init_mount+0x68/0xac >>> do_mount_root+0x108/0x1dc >>> mount_root_generic+0x100/0x330 >>> mount_root+0x160/0x2d0 >>> initrd_load+0x1f0/0x2a0 >>> prepare_namespace+0x4c/0x29c >>> kernel_init_freeable+0x4b4/0x50c >>> kernel_init+0x20/0x1d8 >>> ret_from_fork+0x10/0x20 >>> irq event stamp: 1305682 >>> hardirqs last enabled at (1305681): [<ffff8000800e332c>] >>> console_unlock+0x124/0x130 >>> hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c >>> softirqs last enabled at (1305678): [<ffff80008005be1c>] >>> handle_softirqs+0x4cc/0x4e4 >>> softirqs last disabled at (1305665): [<ffff8000800105b0>] >>> __do_softirq+0x14/0x20 >>> ---[ end trace 0000000000000000 ]--- >>> >>> This "bus limit 0" seems to be a bit suspicious to me as well as the >>> fact that swiotlb is used for the MMC DMA. I will investigate this >>> further tomorrow. The board boots fine though. >> Looking at the code I guess that bus_dma_limit set to 0 means no bus >> limit. But dma_mask for your device indicates 32-bit device limit. This >> can't work with address above 4GB. For some reason DMA code tries to >> allocate from higher address. This is most likely the reason >> dma_capable() returns false. > > Indeed this looks like a source of the problem: > > [ 3.123618] Synopsys Designware Multimedia Card Interface Driver > [ 3.139653] dwmmc_rockchip fe2b0000.mmc: IDMAC supports 32-bit > address mode. > [ 3.147739] dwmmc_rockchip fe2b0000.mmc: Using internal DMA controller. > [ 3.161659] dwmmc_rockchip fe2b0000.mmc: Version ID is 270a > [ 3.168455] dwmmc_rockchip fe2b0000.mmc: DW MMC controller at irq > 56,32 bit host data width,256 deep fifo > [ 3.182651] dwmmc_rockchip fe2b0000.mmc: Got CD GPIO > > ... > > [ 11.009258] ------------[ cut here ]------------ > [ 11.014762] dwmmc_rockchip fe2b0000.mmc: swiotlb addr > 0x00000001faf00000+4096 overflow (mask ffffffff, bus limit 0). > > >> ... > > Best regards
On 27.08.2024 09:03, Baruch Siach wrote: > On Tue, Aug 27 2024, Marek Szyprowski wrote: >> On 27.08.2024 06:52, Baruch Siach wrote: >>> Hi Marek, >>> >>> Thanks for your report. >>> >>> On Mon, Aug 26 2024, Marek Szyprowski wrote: >>>> On 11.08.2024 09:09, Baruch Siach wrote: >>>>> From: Catalin Marinas <catalin.marinas@arm.com> >>>>> >>>>> Hardware DMA limit might not be power of 2. When RAM range starts above >>>>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit >>>>> can not encode this limit. >>>>> >>>>> Use plain address for DMA zone limit. >>>>> >>>>> Since DMA zone can now potentially span beyond 4GB physical limit of >>>>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. >>>>> >>>>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> >>>>> Co-developed-by: Baruch Siach <baruch@tkos.co.il> >>>>> Signed-off-by: Baruch Siach <baruch@tkos.co.il> >>>>> --- >>>> This patch landed recently in linux-next as commit ba0fb44aed47 >>>> ("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my >>>> tests I found that it introduces the following warning on ARM64/Rockchip >>>> based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts): >>> Does this warning go away if you revert both 3be9b846896d and ba0fb44aed47? >> Yes, linux-next with above mentioned commits reverted works fine. >> >> >>> Upstream rockchip DTs have no dma-ranges property. Is that the case for >>> your platform as well? >>> >>> Can you share kernel report of DMA zones and swiotlb? On my platform I get: >>> >>> [ 0.000000] Zone ranges: >>> [ 0.000000] DMA [mem 0x0000000800000000-0x000000083fffffff] >>> [ 0.000000] DMA32 empty >>> [ 0.000000] Normal [mem 0x0000000840000000-0x0000000fffffffff] >>> ... >>> [ 0.000000] software IO TLB: area num 8. >>> [ 0.000000] software IO TLB: mapped [mem 0x000000083be38000-0x000000083fe38000] (64MB) >>> >>> What do you get at your end? >> On ba0fb44aed47 I got: >> >> [ 0.000000] NUMA: No NUMA configuration found >> [ 0.000000] NUMA: Faking a node at [mem >> 0x0000000000200000-0x00000001ffffffff] >> [ 0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff] >> [ 0.000000] Zone ranges: >> [ 0.000000] DMA [mem 0x0000000000200000-0x00000001ffffffff] >> [ 0.000000] DMA32 empty >> [ 0.000000] Normal empty >> [ 0.000000] Movable zone start for each node >> [ 0.000000] Early memory node ranges >> [ 0.000000] node 0: [mem 0x0000000000200000-0x00000000083fffff] >> [ 0.000000] node 0: [mem 0x0000000009400000-0x00000000efffffff] >> [ 0.000000] node 0: [mem 0x00000001f0000000-0x00000001ffffffff] >> [ 0.000000] Initmem setup node 0 [mem >> 0x0000000000200000-0x00000001ffffffff] >> [ 0.000000] On node 0, zone DMA: 512 pages in unavailable ranges >> [ 0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges >> [ 0.000000] cma: Reserved 96 MiB at 0x00000001f0000000 on node -1 >> >> ... >> >> [ 0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 3MB >> [ 0.000000] software IO TLB: area num 4. 
>> [ 0.000000] software IO TLB: mapped [mem >> 0x00000001fac00000-0x00000001fb000000] (4MB) >> >> On the fa3c109a6d30 (parent commit of the $subject) I got: >> >> [ 0.000000] NUMA: No NUMA configuration found >> [ 0.000000] NUMA: Faking a node at [mem >> 0x0000000000200000-0x00000001ffffffff] >> [ 0.000000] NUMA: NODE_DATA [mem 0x1ff7a0600-0x1ff7a2fff] >> [ 0.000000] Zone ranges: >> [ 0.000000] DMA [mem 0x0000000000200000-0x00000000ffffffff] >> [ 0.000000] DMA32 empty >> [ 0.000000] Normal [mem 0x0000000100000000-0x00000001ffffffff] >> [ 0.000000] Movable zone start for each node >> [ 0.000000] Early memory node ranges >> [ 0.000000] node 0: [mem 0x0000000000200000-0x00000000083fffff] >> [ 0.000000] node 0: [mem 0x0000000009400000-0x00000000efffffff] >> [ 0.000000] node 0: [mem 0x00000001f0000000-0x00000001ffffffff] >> [ 0.000000] Initmem setup node 0 [mem >> 0x0000000000200000-0x00000001ffffffff] >> [ 0.000000] On node 0, zone DMA: 512 pages in unavailable ranges >> [ 0.000000] On node 0, zone DMA: 4096 pages in unavailable ranges >> [ 0.000000] cma: Reserved 96 MiB at 0x00000000ea000000 on node -1 >> >> ... >> >> [ 0.000000] software IO TLB: area num 4. >> [ 0.000000] software IO TLB: mapped [mem >> 0x00000000e6000000-0x00000000ea000000] (64MB) >> >> It looks that for some reasons $subject patch changes the default zone >> and swiotlb configuration. > Does this fix the issue? > > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > index bfb10969cbf0..7fcd0aaa9bb6 100644 > --- a/arch/arm64/mm/init.c > +++ b/arch/arm64/mm/init.c > @@ -116,6 +116,9 @@ static void __init arch_reserve_crashkernel(void) > > static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit) > { > + if (memblock_start_of_DRAM() < U32_MAX) > + zone_limit = min(zone_limit, U32_MAX); > + > return min(zone_limit, memblock_end_of_DRAM() - 1) + 1; > } > Yes, this fixes my issue. Thanks! Fell free to add: Reported-by: Marek Szyprowski <m.szyprowski@samsung.com> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com> Best regards
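For reference, a standalone sketch that plugs the Odroid M1 values from the boot logs above into Baruch's fixed max_zone_phys() (assuming zone_dma_limit comes out unrestricted because the DT has no dma-ranges, as noted earlier in the thread) reproduces the pre-ba0fb44aed47 layout, with ZONE_DMA ending at 4GB again:

```c
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Sketch of max_zone_phys() with the proposed fix applied; not the exact kernel code. */
static uint64_t max_zone_phys(uint64_t zone_limit,
			      uint64_t dram_start, uint64_t dram_end /* exclusive */)
{
	if (dram_start < UINT32_MAX)
		zone_limit = MIN(zone_limit, (uint64_t)UINT32_MAX);

	return MIN(zone_limit, dram_end - 1) + 1;
}

int main(void)
{
	/* Odroid M1 values taken from Marek's boot log */
	uint64_t dram_start = 0x200000ULL;
	uint64_t dram_end   = 0x200000000ULL;

	/* Assumption: zone_dma_limit is PHYS_ADDR_MAX since neither DT nor ACPI restricts it */
	uint64_t dma_end = max_zone_phys(UINT64_MAX, dram_start, dram_end);

	printf("ZONE_DMA ends at %#llx\n", (unsigned long long)dma_end);	/* 0x100000000 = 4GB */
	return 0;
}
```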
Hi, On 11/08/2024 09:09, Baruch Siach wrote: > From: Catalin Marinas <catalin.marinas@arm.com> > > Hardware DMA limit might not be power of 2. When RAM range starts above > 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit > can not encode this limit. > > Use plain address for DMA zone limit. > > Since DMA zone can now potentially span beyond 4GB physical limit of > DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. > > Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> > Co-developed-by: Baruch Siach <baruch@tkos.co.il> > Signed-off-by: Baruch Siach <baruch@tkos.co.il> > --- > arch/arm64/mm/init.c | 30 +++++++++++++++--------------- > arch/powerpc/mm/mem.c | 5 ++++- > arch/s390/mm/init.c | 2 +- > include/linux/dma-direct.h | 2 +- > kernel/dma/direct.c | 6 +++--- > kernel/dma/pool.c | 4 ++-- > kernel/dma/swiotlb.c | 6 +++--- > 7 files changed, 29 insertions(+), 26 deletions(-) > <snip> This change breaks the Qualcomm SM8550-HDK boot since next-20240826. It doesn't affect SM8550-QRD or other similar SoCs like SM8650 or SM8450. The last CI run on next-20240828 can be found at: https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/pipelines/100936 SM8550-HDK boot log: https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/jobs/165617 bisect log: # bad: [b18bbfc14a38b5234e09c2adcf713e38063a7e6e] Add linux-next specific files for 20240829 # good: [5be63fc19fcaa4c236b307420483578a56986a37] Linux 6.11-rc5 git bisect start 'FETCH_HEAD' 'v6.11-rc5' # bad: [dc09f0263a0accf41821d260f4bf7ad9a4f7b7d8] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git git bisect bad dc09f0263a0accf41821d260f4bf7ad9a4f7b7d8 # bad: [97c7e618752776e03f50311400bb73c01489fb17] Merge branch 'for-next' of git://github.com/Xilinx/linux-xlnx.git git bisect bad 97c7e618752776e03f50311400bb73c01489fb17 # good: [27ad8eb339a5e3f96aed5f3a3b5901994ce7856d] Merge branch 'clang-format' of https://github.com/ojeda/linux.git git bisect good 27ad8eb339a5e3f96aed5f3a3b5901994ce7856d # bad: [fd34d49f6d8dbd157b17b675dc51a145cdad580c] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/khilman/linux-omap.git git bisect bad fd34d49f6d8dbd157b17b675dc51a145cdad580c # bad: [29f35e8cf8bff7c69a740edb1cf3d62d211f5a43] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git git bisect bad 29f35e8cf8bff7c69a740edb1cf3d62d211f5a43 # good: [41860d49473c0c09dc0a2a4d148047f97aaa2539] perf sched: Use perf_tool__init() git bisect good 41860d49473c0c09dc0a2a4d148047f97aaa2539 # good: [6236ebe07131a7746d870f1d8eb3637a8df13e70] perf daemon: Fix the build on more 32-bit architectures git bisect good 6236ebe07131a7746d870f1d8eb3637a8df13e70 # good: [92b0d033c80c882e6be26dfeeb2f24c53bdeeee6] Merge branches 'for-next/acpi', 'for-next/misc', 'for-next/perf', 'for-next/selftests' and 'for-next/timers' into for-next/core git bisect good 92b0d033c80c882e6be26dfeeb2f24c53bdeeee6 # bad: [5db893307df81f0c1dd6b6f167dc263e88ba855a] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rmk/linux.git git bisect bad 5db893307df81f0c1dd6b6f167dc263e88ba855a # bad: [f65eaf209abc5f4761623aaa1e110bdb6de124ed] Merge branch 'for-next' of git://git.infradead.org/users/hch/dma-mapping.git git bisect bad f65eaf209abc5f4761623aaa1e110bdb6de124ed # bad: [f69e342eec008e1bab772d3963c3dd9979293e13] dma-mapping: call ->unmap_page and ->unmap_sg unconditionally git bisect bad 
f69e342eec008e1bab772d3963c3dd9979293e13 # bad: [ba0fb44aed47693cc2482427f63ba6cd19051327] dma-mapping: replace zone_dma_bits by zone_dma_limit git bisect bad ba0fb44aed47693cc2482427f63ba6cd19051327 # good: [fa3c109a6d302b56437a6412c5f3044c3e12de03] dma-mapping: use bit masking to check VM_DMA_COHERENT git bisect good fa3c109a6d302b56437a6412c5f3044c3e12de03 # first bad commit: [ba0fb44aed47693cc2482427f63ba6cd19051327] dma-mapping: replace zone_dma_bits by zone_dma_limit Boot log with earlycon: [ 0.000000] Booting Linux on physical CPU 0x0000000000 [0x411fd461] [ 0.000000] Linux version 6.11.0-rc5-next-20240829-07694-gb36219e63903 (narmstrong@arrakeen.starnux.net) (aarch64-none-linux-gnu-gcc (Arm GNU Toolchain 13.2.rel1 (Build arm-13.7)) 13.2.1 20231009, GNU ld (Arm GNU Toolchain 13.2.rel1 (Build arm-13.7)) 2.41.0.20231009) #397 SMP PREEMPT Thu Aug 29 15:25:39 CEST 2024 [ 0.000000] KASLR enabled [ 0.000000] random: crng init done [ 0.000000] Machine model: Qualcomm Technologies, Inc. SM8550 HDK [ 0.000000] printk: debug: ignoring loglevel setting. [ 0.000000] efi: UEFI not found. [ 0.000000] [Firmware Bug]: Kernel image misaligned at boot, please fix your bootloader! [ 0.000000] OF: reserved mem: 0x0000000080000000..0x00000000809fffff (10240 KiB) nomap non-reusable hyp-region@80000000 [ 0.000000] OF: reserved mem: 0x0000000080a00000..0x0000000080dfffff (4096 KiB) nomap non-reusable cpusys-vm-region@80a00000 [ 0.000000] OF: reserved mem: 0x0000000080e00000..0x00000000811cffff (3904 KiB) nomap non-reusable hyp-tags-region@80e00000 [ 0.000000] OF: reserved mem: 0x00000000811d0000..0x00000000811fffff (192 KiB) nomap non-reusable hyp-tags-reserved-region@811d0000 [ 0.000000] OF: reserved mem: 0x0000000081a00000..0x0000000081c5ffff (2432 KiB) nomap non-reusable xbl-dt-log-merged-region@81a00000 [ 0.000000] OF: reserved mem: 0x0000000081c60000..0x0000000081c7ffff (128 KiB) nomap non-reusable aop-cmd-db-region@81c60000 [ 0.000000] OF: reserved mem: 0x0000000081c80000..0x0000000081cf3fff (464 KiB) nomap non-reusable aop-config-merged-region@81c80000 [ 0.000000] OF: reserved mem: 0x0000000081d00000..0x0000000081efffff (2048 KiB) nomap non-reusable smem@81d00000 [ 0.000000] OF: reserved mem: 0x0000000081f00000..0x0000000081f1ffff (128 KiB) nomap non-reusable adsp-mhi-region@81f00000 [ 0.000000] OF: reserved mem: 0x0000000082600000..0x00000000826fffff (1024 KiB) nomap non-reusable global-sync-region@82600000 [ 0.000000] OF: reserved mem: 0x0000000082700000..0x00000000827fffff (1024 KiB) nomap non-reusable tz-stat-region@82700000 [ 0.000000] OF: reserved mem: 0x0000000082800000..0x0000000086dfffff (71680 KiB) nomap non-reusable cdsp-secure-heap-region@82800000 [ 0.000000] OF: reserved mem: 0x000000008a800000..0x000000009affffff (270336 KiB) nomap non-reusable mpss-region@8a800000 [ 0.000000] OF: reserved mem: 0x000000009b000000..0x000000009b07ffff (512 KiB) nomap non-reusable q6-mpss-dtb-region@9b000000 [ 0.000000] OF: reserved mem: 0x000000009b080000..0x000000009b08ffff (64 KiB) nomap non-reusable ipa-fw-region@9b080000 [ 0.000000] OF: reserved mem: 0x000000009b090000..0x000000009b099fff (40 KiB) nomap non-reusable ipa-gsi-region@9b090000 [ 0.000000] OF: reserved mem: 0x000000009b09a000..0x000000009b09bfff (8 KiB) nomap non-reusable gpu-micro-code-region@9b09a000 [ 0.000000] OF: reserved mem: 0x000000009b100000..0x000000009b27ffff (1536 KiB) nomap non-reusable spss-region@9b100000 [ 0.000000] OF: reserved mem: 0x000000009b280000..0x000000009b2dffff (384 KiB) nomap non-reusable 
spu-tz-shared-region@9b280000 [ 0.000000] OF: reserved mem: 0x000000009b2e0000..0x000000009b2fffff (128 KiB) nomap non-reusable spu-modem-shared-region@9b2e0000 [ 0.000000] OF: reserved mem: 0x000000009b300000..0x000000009bafffff (8192 KiB) nomap non-reusable camera-region@9b300000 [ 0.000000] OF: reserved mem: 0x000000009bb00000..0x000000009c1fffff (7168 KiB) nomap non-reusable video-region@9bb00000 [ 0.000000] OF: reserved mem: 0x000000009c200000..0x000000009c8fffff (7168 KiB) nomap non-reusable cvp-region@9c200000 [ 0.000000] OF: reserved mem: 0x000000009c900000..0x000000009e8fffff (32768 KiB) nomap non-reusable cdsp-region@9c900000 [ 0.000000] OF: reserved mem: 0x000000009e900000..0x000000009e97ffff (512 KiB) nomap non-reusable q6-cdsp-dtb-region@9e900000 [ 0.000000] OF: reserved mem: 0x000000009e980000..0x000000009e9fffff (512 KiB) nomap non-reusable q6-adsp-dtb-region@9e980000 [ 0.000000] OF: reserved mem: 0x000000009ea00000..0x00000000a2a7ffff (66048 KiB) nomap non-reusable adspslpi-region@9ea00000 [ 0.000000] OF: reserved mem: 0x00000000d4a80000..0x00000000d4cfffff (2560 KiB) nomap non-reusable rmtfs-region@d4a80000 [ 0.000000] OF: reserved mem: 0x00000000d4d00000..0x00000000d7ffffff (52224 KiB) nomap non-reusable mpss-dsm-region@d4d00000 [ 0.000000] OF: reserved mem: 0x00000000d8000000..0x00000000d80fffff (1024 KiB) nomap non-reusable tz-reserved-region@d8000000 [ 0.000000] OF: reserved mem: 0x00000000d8100000..0x00000000d813ffff (256 KiB) nomap non-reusable xbl-sc-region@d8100000 [ 0.000000] OF: reserved mem: 0x00000000d8140000..0x00000000d82fffff (1792 KiB) nomap non-reusable cpucp-fw-region@d8140000 [ 0.000000] OF: reserved mem: 0x00000000d8300000..0x00000000d87fffff (5120 KiB) nomap non-reusable qtee-region@d8300000 [ 0.000000] OF: reserved mem: 0x00000000d8800000..0x00000000e11fffff (141312 KiB) nomap non-reusable ta-region@d8800000 [ 0.000000] OF: reserved mem: 0x00000000e1200000..0x00000000e393ffff (40192 KiB) nomap non-reusable tz-tags-region@e1200000 [ 0.000000] OF: reserved mem: 0x00000000e6440000..0x00000000e66b8fff (2532 KiB) nomap non-reusable hwfence-shbuf-region@e6440000 [ 0.000000] OF: reserved mem: 0x00000000f3600000..0x00000000f80edfff (76728 KiB) nomap non-reusable trust-ui-vm-region@f3600000 [ 0.000000] OF: reserved mem: 0x00000000f80ee000..0x00000000f80eefff (4 KiB) nomap non-reusable trust-ui-vm-dump-region@f80ee000 [ 0.000000] OF: reserved mem: 0x00000000f80ef000..0x00000000f80f7fff (36 KiB) nomap non-reusable trust-ui-vm-qrt-region@f80ef000 [ 0.000000] OF: reserved mem: 0x00000000f80f8000..0x00000000f80fbfff (16 KiB) nomap non-reusable trust-ui-vm-vblk0-ring-region@f80f8000 [ 0.000000] OF: reserved mem: 0x00000000f80fc000..0x00000000f80fffff (16 KiB) nomap non-reusable trust-ui-vm-vblk1-ring-region@f80fc000 [ 0.000000] OF: reserved mem: 0x00000000f8100000..0x00000000f81fffff (1024 KiB) nomap non-reusable trust-ui-vm-swiotlb-region@f8100000 [ 0.000000] OF: reserved mem: 0x00000000f8400000..0x00000000fcbfffff (73728 KiB) nomap non-reusable oem-vm-region@f8400000 [ 0.000000] OF: reserved mem: 0x00000000fcc00000..0x00000000fcc03fff (16 KiB) nomap non-reusable oem-vm-vblk0-ring-region@fcc00000 [ 0.000000] OF: reserved mem: 0x00000000fcc04000..0x00000000fcd03fff (1024 KiB) nomap non-reusable oem-vm-swiotlb-region@fcc04000 [ 0.000000] OF: reserved mem: 0x00000000fce00000..0x00000000ff6fffff (41984 KiB) nomap non-reusable hyp-ext-tags-region@fce00000 [ 0.000000] OF: reserved mem: 0x00000000ff700000..0x00000000ff7fffff (1024 KiB) nomap non-reusable 
hyp-ext-reserved-region@ff700000 [ 0.000000] earlycon: qcom_geni0 at MMIO 0x0000000000a9c000 (options '115200n8') [ 0.000000] printk: legacy bootconsole [qcom_geni0] enabled [ 0.000000] NUMA: Faking a node at [mem 0x0000000080000000-0x0000000affffffff] [ 0.000000] NODE_DATA(0) allocated [mem 0xafe933200-0xafe93583f] [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000080000000-0x0000000affffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal empty [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000080000000-0x0000000080dfffff] [ 0.000000] node 0: [mem 0x00000000811d0000-0x00000000811fffff] [ 0.000000] node 0: [mem 0x0000000081200000-0x00000000819fffff] [ 0.000000] node 0: [mem 0x0000000081a00000-0x0000000081cf3fff] [ 0.000000] node 0: [mem 0x0000000081cf4000-0x0000000081cfffff] [ 0.000000] node 0: [mem 0x0000000081d00000-0x0000000081f1ffff] [ 0.000000] node 0: [mem 0x0000000081f20000-0x00000000825fffff] [ 0.000000] node 0: [mem 0x0000000082600000-0x0000000086dfffff] [ 0.000000] node 0: [mem 0x0000000086e00000-0x000000008a7fffff] [ 0.000000] node 0: [mem 0x000000008a800000-0x000000009b09bfff] [ 0.000000] node 0: [mem 0x000000009b09c000-0x000000009b0fffff] [ 0.000000] node 0: [mem 0x000000009b100000-0x00000000a2a7ffff] [ 0.000000] node 0: [mem 0x00000000a2a80000-0x00000000d4a7ffff] [ 0.000000] node 0: [mem 0x00000000d4a80000-0x00000000d7ffffff] [ 0.000000] node 0: [mem 0x00000000d8140000-0x00000000d815ffff] [ 0.000000] node 0: [mem 0x00000000e1bb0000-0x00000000e393ffff] [ 0.000000] node 0: [mem 0x00000000e3940000-0x00000000e643ffff] [ 0.000000] node 0: [mem 0x00000000e6440000-0x00000000e66b8fff] [ 0.000000] node 0: [mem 0x00000000e66b9000-0x00000000f35fffff] [ 0.000000] node 0: [mem 0x00000000f3600000-0x00000000f81fffff] [ 0.000000] node 0: [mem 0x00000000f8200000-0x00000000f83fffff] [ 0.000000] node 0: [mem 0x00000000f8400000-0x00000000fcd03fff] [ 0.000000] node 0: [mem 0x00000000fcd04000-0x00000000fcdfffff] [ 0.000000] node 0: [mem 0x00000000fce00000-0x00000000ff7fffff] [ 0.000000] node 0: [mem 0x00000000ff800000-0x00000000ffffffff] [ 0.000000] node 0: [mem 0x0000000880000000-0x00000008b99fffff] [ 0.000000] node 0: [mem 0x00000008c0000000-0x0000000affffffff] [ 0.000000] Initmem setup node 0 [mem 0x0000000080000000-0x0000000affffffff] [ 0.000000] On node 0, zone DMA: 976 pages in unavailable ranges [ 0.000000] On node 0, zone DMA: 320 pages in unavailable ranges [ 0.000000] On node 0, zone DMA: 39504 pages in unavailable ranges [ 0.000000] On node 0, zone DMA: 26112 pages in unavailable ranges [ 0.000000] cma: Reserved 32 MiB at 0x0000000880000000 on node -1 [ 0.000000] psci: probing for conduit method from DT. [ 0.000000] psci: PSCIv1.1 detected in firmware. [ 0.000000] psci: Using standard PSCI v0.2 function IDs [ 0.000000] psci: MIGRATE_INFO_TYPE not supported. [ 0.000000] psci: SMC Calling Convention v1.3 [ 0.000000] psci: OSI mode supported. 
[ 0.000000] percpu: Embedded 25 pages/cpu s61656 r8192 d32552 u102400 [ 0.000000] pcpu-alloc: s61656 r8192 d32552 u102400 alloc=25*4096 [ 0.000000] pcpu-alloc: [0] 0 [0] 1 [0] 2 [0] 3 [0] 4 [0] 5 [0] 6 [0] 7 [ 0.000000] Detected PIPT I-cache on CPU0 [ 0.000000] CPU features: detected: Address authentication (architected QARMA5 algorithm) [ 0.000000] CPU features: detected: GIC system register CPU interface [ 0.000000] CPU features: detected: Spectre-v4 [ 0.000000] CPU features: detected: ARM erratum 2457168 [ 0.000000] CPU features: detected: ARM erratum 2658417 [ 0.000000] CPU features: detected: ARM errata 2966298, 3117295 [ 0.000000] alternatives: applying boot alternatives [ 0.000000] Kernel command line: earlycon root= allow_mismatched_32bit_el0 clk_ignore_unused pd_ignore_unused systemd.mask=rmtfs.service ignore_loglevel -- androidboot.verifiedbootstate=orange androidboot.keymaster=1 androidboot.bootdevice=1d84000.ufshc androidboot.fstab_suffix=default androidboot.boot_devices=soc/1d84000.ufshc androidboot.serialno=2a7a254f androidboot.baseband=apq androidboot.force_normal_boot=1 [ 0.000000] Dentry cache hash table entries: 2097152 (order: 12, 16777216 bytes, linear) [ 0.000000] Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes, linear) [ 0.000000] Fallback order for Node 0: 0 [ 0.000000] Built 1 zonelists, mobility grouping on. Total pages: 3078816 [ 0.000000] Policy zone: DMA [ 0.000000] mem auto-init: stack:all(zero), heap alloc:off, heap free:off [ 0.000000] software IO TLB: SWIOTLB bounce buffer size adjusted to 11MB [ 0.000000] software IO TLB: area num 8. [ 0.000000] software IO TLB: SWIOTLB bounce buffer size roundup to 16MB [ 0.000000] software IO TLB: mapped [mem 0x0000000aeff38000-0x0000000af0f38000] (16MB) [ 0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=8, Nodes=1 [ 0.000000] rcu: Preemptible hierarchical RCU implementation. [ 0.000000] rcu: RCU event tracing is enabled. [ 0.000000] rcu: RCU restricting CPUs from NR_CPUS=512 to nr_cpu_ids=8. [ 0.000000] Trampoline variant of Tasks RCU enabled. [ 0.000000] Tracing variant of Tasks RCU enabled. [ 0.000000] rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies. [ 0.000000] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=8 [ 0.000000] RCU Tasks: Setting shift to 3 and lim to 1 rcu_task_cb_adjust=1 rcu_task_cpu_ids=8. [ 0.000000] RCU Tasks Trace: Setting shift to 3 and lim to 1 rcu_task_cb_adjust=1 rcu_task_cpu_ids=8. [ 0.000000] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0 [ 0.000000] GICv3: 988 SPIs implemented [ 0.000000] GICv3: 0 Extended SPIs implemented [ 0.000000] Root IRQ handler: gic_handle_irq [ 0.000000] GICv3: GICv3 features: 16 PPIs, DirectLPI [ 0.000000] GICv3: GICD_CTRL.DS=1, SCR_EL3.FIQ=0 [ 0.000000] GICv3: Enabling SGIs without active state [ 0.000000] GICv3: CPU0: found redistributor 0 region 0:0x0000000017180000 [ 0.000000] ITS [mem 0x17140000-0x1715ffff] [ 0.000000] ITS@0x0000000017140000: Devices Table too large, reduce ids 32->19 [ 0.000000] ITS@0x0000000017140000: Devices too large, reduce ITS pages 1024->256 [ 0.000000] ITS@0x0000000017140000: allocated 131072 Devices @81500000 (indirect, esz 8, psz 4K, shr 1) [ 0.000000] ITS@0x0000000017140000: allocated 4096 Interrupt Collections @8145b000 (flat, esz 1, psz 4K, shr 1) [ 0.000000] GICv3: using LPI property table @0x0000000081470000 [ 0.000000] GICv3: CPU0: using allocated LPI pending table @0x0000000081480000 [ 0.000000] rcu: srcu_init: Setting srcu_struct sizes based on contention. 
[ 0.000000] arch_timer: cp15 and mmio timer(s) running at 19.20MHz (virt/virt). [ 0.000000] clocksource: arch_sys_counter: mask: 0xffffffffffffff max_cycles: 0x46d987e47, max_idle_ns: 440795202767 ns [ 0.000000] sched_clock: 56 bits at 19MHz, resolution 52ns, wraps every 4398046511078ns [ 0.008355] arm-pv: using stolen time PV [ 0.012595] Console: colour dummy device 80x25 [ 0.017182] printk: legacy console [tty0] enabled [ 0.022031] printk: legacy bootconsole [qcom_geni0] disabled <-----------------> System handler takes over, I'll analyze the system firmware logs to understand which access made the crash. I tested the "arm64: mm: fix DMA zone when dma-ranges is missing" patch at [1], but it doesn't help. [1] https://lore.kernel.org/all/d8e92b14181fc815773ac4f4bba70b5d48bc390e.1724838684.git.baruch@tkos.co.il/ Neil #regzbot introduced: ba0fb44aed47693cc2482427f63ba6cd19051327
On 2024-08-29 2:42 pm, Neil Armstrong wrote: > Hi, > > On 11/08/2024 09:09, Baruch Siach wrote: >> From: Catalin Marinas <catalin.marinas@arm.com> >> >> Hardware DMA limit might not be power of 2. When RAM range starts above >> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit >> can not encode this limit. >> >> Use plain address for DMA zone limit. >> >> Since DMA zone can now potentially span beyond 4GB physical limit of >> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. >> >> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> >> Co-developed-by: Baruch Siach <baruch@tkos.co.il> >> Signed-off-by: Baruch Siach <baruch@tkos.co.il> >> --- >> arch/arm64/mm/init.c | 30 +++++++++++++++--------------- >> arch/powerpc/mm/mem.c | 5 ++++- >> arch/s390/mm/init.c | 2 +- >> include/linux/dma-direct.h | 2 +- >> kernel/dma/direct.c | 6 +++--- >> kernel/dma/pool.c | 4 ++-- >> kernel/dma/swiotlb.c | 6 +++--- >> 7 files changed, 29 insertions(+), 26 deletions(-) >> > > <snip> > > This change breaks the Qualcomm SM8550-HDK boot since next-20240826. > It doesn't affect SM8550-QRD or other similar SoCs like SM8650 or SM8450. > The last CI run on next-20240828 can be found at: > https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/pipelines/100936 > > SM8550-HDK boot log: > https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/jobs/165617 > [...] Yeah, a 35-bit ZONE_DMA is sure to make stuff go wrong: > [ 0.000000] Zone ranges: > [ 0.000000] DMA [mem 0x0000000080000000-0x0000000affffffff] > [ 0.000000] DMA32 empty > [ 0.000000] Normal empty Compared to before: [ 0.000000] DMA [mem 0x0000000080000000-0x00000000ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000100000000-0x0000000affffffff] This'll be because the SoC DT is describing a general non-restrictive range: dma-ranges = <0 0 0 0 0x10 0>; Which proves we need more information than {acpi,of}_dma_get_max_cpu_address() are currently able to give us, because what zone_dma_limit actually wants to be is the *minimum* of the lowest highest CPU address of any DMA range, and the lowest CPU address of any DMA range + 2^32. I was thinking it had all ended up looking a bit too easy... :) I think v1 of the fix[1] might actually work out for this, albeit still for the wrong reasons - if so, I concede that maybe at this point it might be safest to go back to that one as a quick short-term fix (with a big fat comment to say so) rather than try to rush the proper solution or revert everything. Thanks, Robin. [1] https://lore.kernel.org/linux-arm-kernel/731d204f5f556ad61bbaf004b1d984f83c90b4f5.1724748249.git.baruch@tkos.co.il/
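For reference, one way to read the rule Robin spells out is the sketch below. It is hypothetical: the struct and helper names are invented here, only a single parsed range is shown, and the limit is treated as an inclusive address, so "+ 2^32" appears as "+ 0xffffffff".

#include <stdint.h>
#include <stdio.h>

struct dma_range {
        uint64_t cpu_start;     /* lowest CPU address the range maps */
        uint64_t cpu_end;       /* highest CPU address the range maps */
};

/* minimum over all ranges of min(range end, range start + 32-bit reach) */
static uint64_t compute_zone_dma_limit(const struct dma_range *ranges,
                                       unsigned int nr)
{
        uint64_t limit = UINT64_MAX;

        for (unsigned int i = 0; i < nr; i++) {
                uint64_t bound = ranges[i].cpu_end;
                uint64_t reach32 = ranges[i].cpu_start + 0xffffffffULL;

                if (reach32 < bound)
                        bound = reach32;
                if (bound < limit)
                        limit = bound;
        }
        return limit;
}

int main(void)
{
        /* SM8550 dma-ranges = <0 0 0 0 0x10 0>: CPU 0x0 .. 0xfffffffff */
        struct dma_range soc = { .cpu_start = 0x0, .cpu_end = 0xfffffffffULL };

        /* prints 0xffffffff: ZONE_DMA capped at 4GB again */
        printf("%#llx\n", (unsigned long long)compute_zone_dma_limit(&soc, 1));
        return 0;
}

Under that reading, the permissive SM8550 range yields min(0xfffffffff, 0xffffffff) = 0xffffffff, so ZONE_DMA would again stop at 4GB as it did before ba0fb44aed47.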
Hi Robin, On 29/08/2024 16:38, Robin Murphy wrote: > On 2024-08-29 2:42 pm, Neil Armstrong wrote: >> Hi, >> >> On 11/08/2024 09:09, Baruch Siach wrote: >>> From: Catalin Marinas <catalin.marinas@arm.com> >>> >>> Hardware DMA limit might not be power of 2. When RAM range starts above >>> 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit >>> can not encode this limit. >>> >>> Use plain address for DMA zone limit. >>> >>> Since DMA zone can now potentially span beyond 4GB physical limit of >>> DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. >>> >>> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> >>> Co-developed-by: Baruch Siach <baruch@tkos.co.il> >>> Signed-off-by: Baruch Siach <baruch@tkos.co.il> >>> --- >>> arch/arm64/mm/init.c | 30 +++++++++++++++--------------- >>> arch/powerpc/mm/mem.c | 5 ++++- >>> arch/s390/mm/init.c | 2 +- >>> include/linux/dma-direct.h | 2 +- >>> kernel/dma/direct.c | 6 +++--- >>> kernel/dma/pool.c | 4 ++-- >>> kernel/dma/swiotlb.c | 6 +++--- >>> 7 files changed, 29 insertions(+), 26 deletions(-) >>> >> >> <snip> >> >> This change breaks the Qualcomm SM8550-HDK boot since next-20240826. >> It doesn't affect SM8550-QRD or other similar SoCs like SM8650 or SM8450. >> The last CI run on next-20240828 can be found at: >> https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/pipelines/100936 >> >> SM8550-HDK boot log: >> https://git.codelinaro.org/linaro/qcomlt/ci/staging/cdba-tester/-/jobs/165617 >> > [...] > > Yeah, a 35-bit ZONE_DMA is sure to make stuff go wrong: > >> [ 0.000000] Zone ranges: >> [ 0.000000] DMA [mem 0x0000000080000000-0x0000000affffffff] >> [ 0.000000] DMA32 empty >> [ 0.000000] Normal empty > > Compared to before: > > [ 0.000000] DMA [mem 0x0000000080000000-0x00000000ffffffff] > [ 0.000000] DMA32 empty > [ 0.000000] Normal [mem 0x0000000100000000-0x0000000affffffff] > > This'll be because the SoC DT is describing a general non-restrictive range: > dma-ranges = <0 0 0 0 0x10 0>; > > Which proves we need more information than {acpi,of}_dma_get_max_cpu_address() are currently able to give us, because what zone_dma_limit actually wants to be is the *minimum* of the lowest highest CPU address of any DMA range, and the lowest CPU address of any DMA range + 2^32. I was thinking it had all ended up looking a bit too easy... :) > > I think v1 of the fix[1] might actually work out for this, albeit still for the wrong reasons - if so, I concede that maybe at this point it might be safest to go back to that one as a quick short-term fix (with a big fat comment to say so) rather than try to rush the proper solution or revert everything. Indeed v1 patches makes boot work again: [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000080000000-0x00000000ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000100000000-0x0000000affffffff] Please add my: Tested-by: Neil Armstrong <neil.armstrong@linaro.org> # on SM8550-HDK Thanks, Neil > > Thanks, > Robin. > > [1] https://lore.kernel.org/linux-arm-kernel/731d204f5f556ad61bbaf004b1d984f83c90b4f5.1724748249.git.baruch@tkos.co.il/
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 9b5ab6818f7f..c45e2152ca9e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
 }
 
 /*
- * Return the maximum physical address for a zone accessible by the given bits
- * limit. If DRAM starts above 32-bit, expand the zone to the maximum
+ * Return the maximum physical address for a zone given its limit.
+ * If DRAM starts above 32-bit, expand the zone to the maximum
  * available memory, otherwise cap it at 32-bit.
  */
-static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
+static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
 {
-        phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
         phys_addr_t phys_start = memblock_start_of_DRAM();
 
         if (phys_start > U32_MAX)
-                zone_mask = PHYS_ADDR_MAX;
-        else if (phys_start > zone_mask)
-                zone_mask = U32_MAX;
+                zone_limit = PHYS_ADDR_MAX;
+        else if (phys_start > zone_limit)
+                zone_limit = U32_MAX;
 
-        return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
+        return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
 }
 
 static void __init zone_sizes_init(void)
 {
         unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
-        unsigned int __maybe_unused acpi_zone_dma_bits;
-        unsigned int __maybe_unused dt_zone_dma_bits;
-        phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
+        phys_addr_t __maybe_unused acpi_zone_dma_limit;
+        phys_addr_t __maybe_unused dt_zone_dma_limit;
+        phys_addr_t __maybe_unused dma32_phys_limit =
+                max_zone_phys(DMA_BIT_MASK(32));
 
 #ifdef CONFIG_ZONE_DMA
-        acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
-        dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
-        zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
-        arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
+        acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
+        dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
+        zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
+        arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
         max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
 #endif
 #ifdef CONFIG_ZONE_DMA32
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d325217ab201..05b7f702b3f7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void)
  * everything else. GFP_DMA32 page allocations automatically fall back to
  * ZONE_DMA.
  *
- * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
+ * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
  * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU
  * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
  * ZONE_DMA.
@@ -230,6 +230,7 @@ void __init paging_init(void)
 {
         unsigned long long total_ram = memblock_phys_mem_size();
         phys_addr_t top_of_ram = memblock_end_of_DRAM();
+        int zone_dma_bits;
 
 #ifdef CONFIG_HIGHMEM
         unsigned long v = __fix_to_virt(FIX_KMAP_END);
@@ -256,6 +257,8 @@ void __init paging_init(void)
         else
                 zone_dma_bits = 31;
 
+        zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
+
 #ifdef CONFIG_ZONE_DMA
         max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
                                       1UL << (zone_dma_bits - PAGE_SHIFT));
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ddcd39ef4346..91fc2b91adfc 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -97,7 +97,7 @@ void __init paging_init(void)
 
         vmem_map_init();
         sparse_init();
-        zone_dma_bits = 31;
+        zone_dma_limit = DMA_BIT_MASK(31);
         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
         max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
         max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index edbe13d00776..d7e30d4f7503 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -12,7 +12,7 @@
 #include <linux/mem_encrypt.h>
 #include <linux/swiotlb.h>
 
-extern unsigned int zone_dma_bits;
+extern u64 zone_dma_limit;
 
 /*
  * Record the mapping of CPU physical to DMA addresses for a given region.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4480a3cd92e0..f2ba074a6a54 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -20,7 +20,7 @@
  * it for entirely different regions. In that case the arch code needs to
  * override the variable below for dma-direct to work properly.
  */
-unsigned int zone_dma_bits __ro_after_init = 24;
+u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
 
 static inline dma_addr_t phys_to_dma_direct(struct device *dev,
                 phys_addr_t phys)
@@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
          * zones.
          */
         *phys_limit = dma_to_phys(dev, dma_limit);
-        if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+        if (*phys_limit <= zone_dma_limit)
                 return GFP_DMA;
         if (*phys_limit <= DMA_BIT_MASK(32))
                 return GFP_DMA32;
@@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
          * part of the check.
          */
         if (IS_ENABLED(CONFIG_ZONE_DMA))
-                min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
+                min_mask = min_t(u64, min_mask, zone_dma_limit);
         return mask >= phys_to_dma_unencrypted(dev, min_mask);
 }
 
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d10613eb0f63..7b04f7575796 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
         /* CMA can't cross zone boundaries, see cma_activate_area() */
         end = cma_get_base(cma) + size - 1;
         if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
-                return end <= DMA_BIT_MASK(zone_dma_bits);
+                return end <= zone_dma_limit;
         if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
-                return end <= DMA_BIT_MASK(32);
+                return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
         return true;
 }
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index df68d29740a0..abcf3fa63a56 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
         if (!remap)
                 io_tlb_default_mem.can_grow = true;
         if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
-                io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
+                io_tlb_default_mem.phys_limit = zone_dma_limit;
         else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
-                io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
+                io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
         else
                 io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
 #endif
@@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
         }
 
         gfp &= ~GFP_ZONEMASK;
-        if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
+        if (phys_limit <= zone_dma_limit)
                 gfp |= __GFP_DMA;
         else if (phys_limit <= DMA_BIT_MASK(32))
                 gfp |= __GFP_DMA32;
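As an illustration of what the replacement buys, the sketch below is modelled on dma_direct_optimal_gfp_mask() from the hunk above, with the dma_to_phys() translation left out for brevity, so phys_limit is assumed to already be a CPU physical address. It shows the commit-message example of RAM starting at 4GB with a 30-bit device limit: the zone limit is now the plain address 0x13fffffff (5GB - 1), which no single high bit could encode, and a 32-bit request also lands in ZONE_DMA because the zone reaches past 4GB. The same reasoning is why pool.c and swiotlb.c above compare against max(DMA_BIT_MASK(32), zone_dma_limit).

#include <stdint.h>
#include <stdio.h>

#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

/* a plain address now, not a bit count; default mirrors the old 24-bit ISA limit */
static uint64_t zone_dma_limit = DMA_BIT_MASK(24);

static const char *optimal_zone_flag(uint64_t phys_limit)
{
        if (phys_limit <= zone_dma_limit)
                return "GFP_DMA";
        if (phys_limit <= DMA_BIT_MASK(32))
                return "GFP_DMA32";
        return "none";
}

int main(void)
{
        /* RAM starting at 4GB with a 30-bit device limit: ZONE_DMA tops out at 5GB */
        zone_dma_limit = 0x100000000ULL + DMA_BIT_MASK(30);

        /* 30-bit device: GFP_DMA */
        printf("%s\n", optimal_zone_flag(zone_dma_limit));
        /* 32-bit device: also GFP_DMA, since ZONE_DMA now reaches past 4GB */
        printf("%s\n", optimal_zone_flag(DMA_BIT_MASK(32)));
        /* unrestricted device: no zone flag needed */
        printf("%s\n", optimal_zone_flag(DMA_BIT_MASK(64)));
        return 0;
}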