Message ID | 20240806105135.218089-1-amhetre@nvidia.com |
---|---|
State | Handled Elsewhere |
Headers | show |
Series | [V4] iommu/io-pgtable-arm: Optimise non-coherent unmap | expand |
On 8/6/2024 4:21 PM, Ashish Mhetre wrote: > The current __arm_lpae_unmap() function calls dma_sync() on individual > PTEs after clearing them. Overall unmap performance can be improved by > around 25% for large buffer sizes by combining the syncs for adjacent > leaf entries. > Optimize the unmap time by clearing all the leaf entries and issuing a > single dma_sync() for them. > Below is a detailed analysis of average unmap latency (in us) with and > without this optimization obtained by running dma_map_benchmark for > different buffer sizes. > > UnMap Latency(us) > Size Without With % gain with > optimization optimization optimization > > 4KB 3 3 0 > 8KB 4 3.8 5 > 16KB 6.1 5.4 11.48 > 32KB 10.2 8.5 16.67 > 64KB 18.5 14.9 19.46 > 128KB 35 27.5 21.43 > 256KB 67.5 52.2 22.67 > 512KB 127.9 97.2 24.00 > 1MB 248.6 187.4 24.62 > 2MB 65.5 65.5 0 > 4MB 119.2 119 0.17 > > Reviewed-by: Robin Murphy <robin.murphy@arm.com> > Signed-off-by: Ashish Mhetre <amhetre@nvidia.com> > --- > Changes in V2: > - Updated the commit message to be imperative. > - Fixed ptep at incorrect index getting cleared for non-leaf entries. > > Changes in V3: > - Used loop-local variables and removed redundant function variables. > - Added check for zero-sized dma_sync in __arm_lpae_clear_pte(). > - Merged both patches into this single patch by adding check for a > NULL gather in __arm_lpae_unmap() itself. > > Changes in V4: > - Updated the subject in commit message to correctly reflect the changes > made in this patch. 
> --- > drivers/iommu/io-pgtable-arm.c | 31 +++++++++++++++++-------------- > 1 file changed, 17 insertions(+), 14 deletions(-) > > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c > index f5d9fd1f45bf..6fecf3d9fe67 100644 > --- a/drivers/iommu/io-pgtable-arm.c > +++ b/drivers/iommu/io-pgtable-arm.c > @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, > sizeof(*ptep) * num_entries, DMA_TO_DEVICE); > } > > -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg) > +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries) > { > + for (int i = 0; i < num_entries; i++) > + ptep[i] = 0; > > - *ptep = 0; > - > - if (!cfg->coherent_walk) > - __arm_lpae_sync_pte(ptep, 1, cfg); > + if (!cfg->coherent_walk && num_entries) > + __arm_lpae_sync_pte(ptep, num_entries, cfg); > } > > static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, > @@ -654,26 +654,29 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, > max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start; > num_entries = min_t(int, pgcount, max_entries); > > - while (i < num_entries) { > - pte = READ_ONCE(*ptep); > + /* Find and handle non-leaf entries */ > + for (i = 0; i < num_entries; i++) { > + pte = READ_ONCE(ptep[i]); > if (WARN_ON(!pte)) > break; > > - __arm_lpae_clear_pte(ptep, &iop->cfg); > - > if (!iopte_leaf(pte, lvl, iop->fmt)) { > + __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1); > + > /* Also flush any partial walks */ > io_pgtable_tlb_flush_walk(iop, iova + i * size, size, > ARM_LPAE_GRANULE(data)); > __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); > - } else if (!iommu_iotlb_gather_queued(gather)) { > - io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); > } > - > - ptep++; > - i++; > } > > + /* Clear the remaining entries */ > + __arm_lpae_clear_pte(ptep, &iop->cfg, i); > + > + if (gather && 
!iommu_iotlb_gather_queued(gather)) > + for (int j = 0; j < i; j++) > + io_pgtable_tlb_add_page(iop, gather, iova + j * size, size); > + > return i * size; > } else if (iopte_leaf(pte, lvl, iop->fmt)) { > /* Hi all, Can you please review this patch and provide feedback? Thanks, Ashish Mhetre
On 8/12/2024 9:53 AM, Ashish Mhetre wrote: > External email: Use caution opening links or attachments > > > On 8/6/2024 4:21 PM, Ashish Mhetre wrote: >> The current __arm_lpae_unmap() function calls dma_sync() on individual >> PTEs after clearing them. Overall unmap performance can be improved by >> around 25% for large buffer sizes by combining the syncs for adjacent >> leaf entries. >> Optimize the unmap time by clearing all the leaf entries and issuing a >> single dma_sync() for them. >> Below is a detailed analysis of average unmap latency (in us) with and >> without this optimization obtained by running dma_map_benchmark for >> different buffer sizes. >> >> UnMap Latency(us) >> Size Without With % gain with >> optimization optimization optimization >> >> 4KB 3 3 0 >> 8KB 4 3.8 5 >> 16KB 6.1 5.4 11.48 >> 32KB 10.2 8.5 16.67 >> 64KB 18.5 14.9 19.46 >> 128KB 35 27.5 21.43 >> 256KB 67.5 52.2 22.67 >> 512KB 127.9 97.2 24.00 >> 1MB 248.6 187.4 24.62 >> 2MB 65.5 65.5 0 >> 4MB 119.2 119 0.17 >> >> Reviewed-by: Robin Murphy <robin.murphy@arm.com> >> Signed-off-by: Ashish Mhetre <amhetre@nvidia.com> >> --- >> Changes in V2: >> - Updated the commit message to be imperative. >> - Fixed ptep at incorrect index getting cleared for non-leaf entries. >> >> Changes in V3: >> - Used loop-local variables and removed redundant function variables. >> - Added check for zero-sized dma_sync in __arm_lpae_clear_pte(). >> - Merged both patches into this single patch by adding check for a >> NULL gather in __arm_lpae_unmap() itself. >> >> Changes in V4: >> - Updated the subject in commit message to correctly reflect the changes >> made in this patch. 
>> --- >> drivers/iommu/io-pgtable-arm.c | 31 +++++++++++++++++-------------- >> 1 file changed, 17 insertions(+), 14 deletions(-) >> >> diff --git a/drivers/iommu/io-pgtable-arm.c >> b/drivers/iommu/io-pgtable-arm.c >> index f5d9fd1f45bf..6fecf3d9fe67 100644 >> --- a/drivers/iommu/io-pgtable-arm.c >> +++ b/drivers/iommu/io-pgtable-arm.c >> @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte >> *ptep, int num_entries, >> sizeof(*ptep) * num_entries, >> DMA_TO_DEVICE); >> } >> >> -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct >> io_pgtable_cfg *cfg) >> +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct >> io_pgtable_cfg *cfg, int num_entries) >> { >> + for (int i = 0; i < num_entries; i++) >> + ptep[i] = 0; >> >> - *ptep = 0; >> - >> - if (!cfg->coherent_walk) >> - __arm_lpae_sync_pte(ptep, 1, cfg); >> + if (!cfg->coherent_walk && num_entries) >> + __arm_lpae_sync_pte(ptep, num_entries, cfg); >> } >> >> static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, >> @@ -654,26 +654,29 @@ static size_t __arm_lpae_unmap(struct >> arm_lpae_io_pgtable *data, >> max_entries = ARM_LPAE_PTES_PER_TABLE(data) - >> unmap_idx_start; >> num_entries = min_t(int, pgcount, max_entries); >> >> - while (i < num_entries) { >> - pte = READ_ONCE(*ptep); >> + /* Find and handle non-leaf entries */ >> + for (i = 0; i < num_entries; i++) { >> + pte = READ_ONCE(ptep[i]); >> if (WARN_ON(!pte)) >> break; >> >> - __arm_lpae_clear_pte(ptep, &iop->cfg); >> - >> if (!iopte_leaf(pte, lvl, iop->fmt)) { >> + __arm_lpae_clear_pte(&ptep[i], >> &iop->cfg, 1); >> + >> /* Also flush any partial walks */ >> io_pgtable_tlb_flush_walk(iop, iova + i >> * size, size, >> ARM_LPAE_GRANULE(data)); >> __arm_lpae_free_pgtable(data, lvl + 1, >> iopte_deref(pte, data)); >> - } else if (!iommu_iotlb_gather_queued(gather)) { >> - io_pgtable_tlb_add_page(iop, gather, >> iova + i * size, size); >> } >> - >> - ptep++; >> - i++; >> } >> >> + /* Clear the remaining entries 
*/ >> + __arm_lpae_clear_pte(ptep, &iop->cfg, i); >> + >> + if (gather && !iommu_iotlb_gather_queued(gather)) >> + for (int j = 0; j < i; j++) >> + io_pgtable_tlb_add_page(iop, gather, >> iova + j * size, size); >> + >> return i * size; >> } else if (iopte_leaf(pte, lvl, iop->fmt)) { >> /* > > Hi all, > > Can you please review this patch and provide feedback? > > Thanks, > Ashish Mhetre > > Hi Will, Joerg, The patch is reviewed by Robin. If there are no more comments, can you please see if you can merge it? Thanks and Regards, Ashish Mhetre
On Tue, Aug 06, 2024 at 10:51:35AM +0000, Ashish Mhetre wrote: > The current __arm_lpae_unmap() function calls dma_sync() on individual > PTEs after clearing them. Overall unmap performance can be improved by > around 25% for large buffer sizes by combining the syncs for adjacent > leaf entries. > Optimize the unmap time by clearing all the leaf entries and issuing a > single dma_sync() for them. > Below is a detailed analysis of average unmap latency (in us) with and > without this optimization obtained by running dma_map_benchmark for > different buffer sizes. > > UnMap Latency(us) > Size Without With % gain with > optimization optimization optimization > > 4KB 3 3 0 > 8KB 4 3.8 5 > 16KB 6.1 5.4 11.48 > 32KB 10.2 8.5 16.67 > 64KB 18.5 14.9 19.46 > 128KB 35 27.5 21.43 > 256KB 67.5 52.2 22.67 > 512KB 127.9 97.2 24.00 > 1MB 248.6 187.4 24.62 > 2MB 65.5 65.5 0 > 4MB 119.2 119 0.17 > > Reviewed-by: Robin Murphy <robin.murphy@arm.com> > Signed-off-by: Ashish Mhetre <amhetre@nvidia.com> > --- > Changes in V2: > - Updated the commit message to be imperative. > - Fixed ptep at incorrect index getting cleared for non-leaf entries. > > Changes in V3: > - Used loop-local variables and removed redundant function variables. > - Added check for zero-sized dma_sync in __arm_lpae_clear_pte(). > - Merged both patches into this single patch by adding check for a > NULL gather in __arm_lpae_unmap() itself. > > Changes in V4: > - Updated the subject in commit message to correctly reflect the changes > made in this patch. > --- > drivers/iommu/io-pgtable-arm.c | 31 +++++++++++++++++-------------- > 1 file changed, 17 insertions(+), 14 deletions(-) Acked-by: Will Deacon <will@kernel.org> Joerg, please can you pick this one up for -next? Cheers, Will
On Tue, Aug 06, 2024 at 10:51:35AM +0000, Ashish Mhetre wrote: > drivers/iommu/io-pgtable-arm.c | 31 +++++++++++++++++-------------- > 1 file changed, 17 insertions(+), 14 deletions(-) Applied, thanks.
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index f5d9fd1f45bf..6fecf3d9fe67 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, sizeof(*ptep) * num_entries, DMA_TO_DEVICE); } -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg) +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries) { + for (int i = 0; i < num_entries; i++) + ptep[i] = 0; - *ptep = 0; - - if (!cfg->coherent_walk) - __arm_lpae_sync_pte(ptep, 1, cfg); + if (!cfg->coherent_walk && num_entries) + __arm_lpae_sync_pte(ptep, num_entries, cfg); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, @@ -654,26 +654,29 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start; num_entries = min_t(int, pgcount, max_entries); - while (i < num_entries) { - pte = READ_ONCE(*ptep); + /* Find and handle non-leaf entries */ + for (i = 0; i < num_entries; i++) { + pte = READ_ONCE(ptep[i]); if (WARN_ON(!pte)) break; - __arm_lpae_clear_pte(ptep, &iop->cfg); - if (!iopte_leaf(pte, lvl, iop->fmt)) { + __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1); + /* Also flush any partial walks */ io_pgtable_tlb_flush_walk(iop, iova + i * size, size, ARM_LPAE_GRANULE(data)); __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); - } else if (!iommu_iotlb_gather_queued(gather)) { - io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); } - - ptep++; - i++; } + /* Clear the remaining entries */ + __arm_lpae_clear_pte(ptep, &iop->cfg, i); + + if (gather && !iommu_iotlb_gather_queued(gather)) + for (int j = 0; j < i; j++) + io_pgtable_tlb_add_page(iop, gather, iova + j * size, size); + return i * size; } else if (iopte_leaf(pte, lvl, iop->fmt)) { /*