diff mbox series

[net-next,4/4] net: mlx5: add xdp tx return bulking support

Message ID 3fb334388ac7af755e1f03abb76a0a6335a90ff6.1603824486.git.lorenzo@kernel.org
State Not Applicable
Delegated to: BPF Maintainers
Headers show
Series xdp: introduce bulking for page_pool tx return path | expand

Checks

Context Check Description
jkicinski/cover_letter success Link
jkicinski/fixes_present success Link
jkicinski/patch_count success Link
jkicinski/tree_selection success Clearly marked for net-next
jkicinski/subject_prefix success Link
jkicinski/source_inline success Was 0 now: 0
jkicinski/verify_signedoff success Link
jkicinski/module_param success Was 0 now: 0
jkicinski/build_32bit fail Errors and warnings before: 4 this patch: 4
jkicinski/kdoc success Errors and warnings before: 0 this patch: 0
jkicinski/verify_fixes success Link
jkicinski/checkpatch fail Link
jkicinski/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
jkicinski/header_inline success Link
jkicinski/stable success Stable not CCed

Commit Message

Lorenzo Bianconi Oct. 27, 2020, 7:04 p.m. UTC
Convert mlx5 driver to xdp_return_frame_bulk APIs.

XDP_REDIRECT (upstream codepath): 8.5Mpps
XDP_REDIRECT (upstream codepath + bulking APIs): 10.1Mpps

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Comments

Shay Agroskin Oct. 29, 2020, 11:42 a.m. UTC | #1
Lorenzo Bianconi <lorenzo@kernel.org> writes:

> Convert mlx5 driver to xdp_return_frame_bulk APIs.
>
> XDP_REDIRECT (upstream codepath): 8.5Mpps
> XDP_REDIRECT (upstream codepath + bulking APIs): 10.1Mpps
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> index ae90d533a350..5fdfbf390d5c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> @@ -369,8 +369,10 @@ static void mlx5e_free_xdpsq_desc(struct 
> mlx5e_xdpsq *sq,
>  				  bool recycle)
>  {
>  	struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo;
> +	struct xdp_frame_bulk bq;
>  	u16 i;
>  
> +	bq.xa = NULL;
>  	for (i = 0; i < wi->num_pkts; i++) {
>  		struct mlx5e_xdp_info xdpi = 
>  mlx5e_xdpi_fifo_pop(xdpi_fifo);
>  
> @@ -379,7 +381,7 @@ static void mlx5e_free_xdpsq_desc(struct 
> mlx5e_xdpsq *sq,
>  			/* XDP_TX from the XSK RQ and XDP_REDIRECT 
>  */
>  			dma_unmap_single(sq->pdev, 
>  xdpi.frame.dma_addr,
>  					 xdpi.frame.xdpf->len, 
>  DMA_TO_DEVICE);
> -			xdp_return_frame(xdpi.frame.xdpf);
> +			xdp_return_frame_bulk(xdpi.frame.xdpf, 
> &bq);
>  			break;
>  		case MLX5E_XDP_XMIT_MODE_PAGE:
>  			/* XDP_TX from the regular RQ */
> @@ -393,6 +395,7 @@ static void mlx5e_free_xdpsq_desc(struct 
> mlx5e_xdpsq *sq,
>  			WARN_ON_ONCE(true);
>  		}
>  	}
> +	xdp_flush_frame_bulk(&bq);

While I understand the rational behind this patchset, using an 
intermediate buffer
	void *q[XDP_BULK_QUEUE_SIZE];
means more pressure on the data cache.

At the time I ran performance tests on mlx5 to see whether 
batching skbs before passing them to GRO would improve 
performance. On some flows I got worse performance.
This function seems to have less Dcache contention than RX flow, 
but maybe some performance testing are needed here.

>  }
>  
>  bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
Lorenzo Bianconi Oct. 29, 2020, 1:15 p.m. UTC | #2
> 
> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> 
> > Convert mlx5 driver to xdp_return_frame_bulk APIs.
> > 
> > XDP_REDIRECT (upstream codepath): 8.5Mpps
> > XDP_REDIRECT (upstream codepath + bulking APIs): 10.1Mpps
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 5 ++++-
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> > b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> > index ae90d533a350..5fdfbf390d5c 100644
> > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> > @@ -369,8 +369,10 @@ static void mlx5e_free_xdpsq_desc(struct
> > mlx5e_xdpsq *sq,
> >  				  bool recycle)
> >  {
> >  	struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo;
> > +	struct xdp_frame_bulk bq;
> >  	u16 i;
> > +	bq.xa = NULL;
> >  	for (i = 0; i < wi->num_pkts; i++) {
> >  		struct mlx5e_xdp_info xdpi =  mlx5e_xdpi_fifo_pop(xdpi_fifo);
> >   @@ -379,7 +381,7 @@ static void mlx5e_free_xdpsq_desc(struct
> > mlx5e_xdpsq *sq,
> >  			/* XDP_TX from the XSK RQ and XDP_REDIRECT  */
> >  			dma_unmap_single(sq->pdev,  xdpi.frame.dma_addr,
> >  					 xdpi.frame.xdpf->len,  DMA_TO_DEVICE);
> > -			xdp_return_frame(xdpi.frame.xdpf);
> > +			xdp_return_frame_bulk(xdpi.frame.xdpf, &bq);
> >  			break;
> >  		case MLX5E_XDP_XMIT_MODE_PAGE:
> >  			/* XDP_TX from the regular RQ */
> > @@ -393,6 +395,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq
> > *sq,
> >  			WARN_ON_ONCE(true);
> >  		}
> >  	}
> > +	xdp_flush_frame_bulk(&bq);
> 
> While I understand the rational behind this patchset, using an intermediate
> buffer
> 	void *q[XDP_BULK_QUEUE_SIZE];
> means more pressure on the data cache.
> 
> At the time I ran performance tests on mlx5 to see whether batching skbs
> before passing them to GRO would improve performance. On some flows I got
> worse performance.
> This function seems to have less Dcache contention than RX flow, but maybe
> some performance testing are needed here.

Hi Shay,

this codepath is only activated for "redirected" frames (not for packets
forwarded to networking stack). We run performance comparisons with the
upstream code for this particular use-case and we reported a nice
improvements (8.5Mpps vs 10.1Mpps).
Do you have in mind other possible performance tests to run?

Regards,
Lorenzo

> 
> >  }
> >  bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index ae90d533a350..5fdfbf390d5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -369,8 +369,10 @@  static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
 				  bool recycle)
 {
 	struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo;
+	struct xdp_frame_bulk bq;
 	u16 i;
 
+	bq.xa = NULL;
 	for (i = 0; i < wi->num_pkts; i++) {
 		struct mlx5e_xdp_info xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
 
@@ -379,7 +381,7 @@  static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
 			/* XDP_TX from the XSK RQ and XDP_REDIRECT */
 			dma_unmap_single(sq->pdev, xdpi.frame.dma_addr,
 					 xdpi.frame.xdpf->len, DMA_TO_DEVICE);
-			xdp_return_frame(xdpi.frame.xdpf);
+			xdp_return_frame_bulk(xdpi.frame.xdpf, &bq);
 			break;
 		case MLX5E_XDP_XMIT_MODE_PAGE:
 			/* XDP_TX from the regular RQ */
@@ -393,6 +395,7 @@  static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
 			WARN_ON_ONCE(true);
 		}
 	}
+	xdp_flush_frame_bulk(&bq);
 }
 
 bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)