
[net-next] dpaa2-eth: Don't use netif_receive_skb_list for TCP frames

Message ID: 1563902923-26178-1-git-send-email-ruxandra.radulescu@nxp.com
State: Changes Requested
Delegated to: David Miller
Series: [net-next] dpaa2-eth: Don't use netif_receive_skb_list for TCP frames

Commit Message

Ioana Radulescu July 23, 2019, 5:28 p.m. UTC
Using Rx skb bulking for all frames may negatively impact the
performance in some TCP termination scenarios, as it effectively
bypasses GRO.

Look at the hardware parse results of each ingress frame to see
if a TCP header is present or not; for TCP frames fall back to
the old implementation.

Signed-off-by: Ioana Radulescu <ruxandra.radulescu@nxp.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 15 ++++++-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 51 ++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

Comments

Saeed Mahameed July 23, 2019, 5:54 p.m. UTC | #1
On Tue, 2019-07-23 at 20:28 +0300, Ioana Radulescu wrote:
> Using Rx skb bulking for all frames may negatively impact the
> performance in some TCP termination scenarios, as it effectively
> bypasses GRO.
> 
> Look at the hardware parse results of each ingress frame to see
> if a TCP header is present or not; for TCP frames fall back to
> the old implementation.
> 
> Signed-off-by: Ioana Radulescu <ruxandra.radulescu@nxp.com>
> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
> ---
>  drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 15 ++++++-
>  drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 51 ++++++++++++++++++++++++
>  2 files changed, 65 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
> index 0acb115..412f87f 100644
> --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
> +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
> @@ -348,6 +348,16 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
>  	return xdp_act;
>  }
>  
> +static bool frame_is_tcp(const struct dpaa2_fd *fd, struct dpaa2_fas *fas)
> +{
> +	struct dpaa2_fapr *fapr = dpaa2_get_fapr(fas, false);
> +
> +	if (!(dpaa2_fd_get_frc(fd) & DPAA2_FD_FRC_FAPRV))
> +		return false;
> +
> +	return !!(fapr->faf_hi & DPAA2_FAF_HI_TCP_PRESENT);
> +}
> +
>  /* Main Rx frame processing routine */
>  static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
>  			 struct dpaa2_eth_channel *ch,
> @@ -435,7 +445,10 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
>  	percpu_stats->rx_packets++;
>  	percpu_stats->rx_bytes += dpaa2_fd_get_len(fd);
>  
> -	list_add_tail(&skb->list, ch->rx_list);
> +	if (frame_is_tcp(fd, fas))
> +		napi_gro_receive(&ch->napi, skb);
> +	else
> +		list_add_tail(&skb->list, ch->rx_list);
>  

Maybe it would be better to introduce napi_gro_receive_list() and let napi
decide when to do napi_gro_receive() or list_add_tail(). The GRO stack
already knows how to make that decision (no need for a specialized hw
parser check for TCP frames), so all drivers would benefit from this and
we would not have to repeat what you did here in every driver.

Also, in that case napi/napi_gro would maintain the temporary Rx skb
list, which seems more natural than implementing it per driver.

On napi_gro_receive_list():
 1) Try GRO.
 2) If the result is GRO_NORMAL, add the skb to the list.
 3) On napi_complete, or when the list budget is consumed, flush the skb
    list by calling netif_receive_skb_list() (rough sketch below).
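
A minimal sketch of what such a core helper could look like, following the
three steps above. This is an illustration only: napi_gro_receive_list(),
napi_flush_rx_list() and the rx_list member of struct napi_struct are
assumed names that do not exist in the tree at this point, and
dev_gro_receive()/napi_skb_finish() are internal net/core helpers.

/* Hypothetical helper in net/core/dev.c -- sketch only, not an in-tree API.
 * Assumes struct napi_struct grows a struct list_head rx_list member.
 */
static gro_result_t napi_gro_receive_list(struct napi_struct *napi,
					  struct sk_buff *skb)
{
	gro_result_t ret;

	/* 1) Try GRO first */
	ret = dev_gro_receive(napi, skb);

	/* 2) GRO_NORMAL: GRO did not coalesce/consume the skb, so batch it */
	if (ret == GRO_NORMAL) {
		list_add_tail(&skb->list, &napi->rx_list);
		return GRO_NORMAL;
	}

	return napi_skb_finish(ret, skb);
}

/* 3) Flush from napi_complete_done() or once the list budget is consumed */
static void napi_flush_rx_list(struct napi_struct *napi)
{
	if (list_empty(&napi->rx_list))
		return;

	netif_receive_skb_list(&napi->rx_list);
	INIT_LIST_HEAD(&napi->rx_list);
}

A driver would then call napi_gro_receive_list() for every frame and drop
its private rx_list handling.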

>  	return;
>  
> diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
> index 9af18c2..d723ae7 100644
> --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
> +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
> @@ -155,6 +155,49 @@ struct dpaa2_fas {
>   */
>  #define DPAA2_TS_OFFSET			0x8
>  
> +/* Frame annotation parse results */
> +struct dpaa2_fapr {
> +	/* 64-bit word 1 */
> +	__le32 faf_lo;
> +	__le16 faf_ext;
> +	__le16 nxt_hdr;
> +	/* 64-bit word 2 */
> +	__le64 faf_hi;
> +	/* 64-bit word 3 */
> +	u8 last_ethertype_offset;
> +	u8 vlan_tci_offset_n;
> +	u8 vlan_tci_offset_1;
> +	u8 llc_snap_offset;
> +	u8 eth_offset;
> +	u8 ip1_pid_offset;
> +	u8 shim_offset_2;
> +	u8 shim_offset_1;
> +	/* 64-bit word 4 */
> +	u8 l5_offset;
> +	u8 l4_offset;
> +	u8 gre_offset;
> +	u8 l3_offset_n;
> +	u8 l3_offset_1;
> +	u8 mpls_offset_n;
> +	u8 mpls_offset_1;
> +	u8 pppoe_offset;
> +	/* 64-bit word 5 */
> +	__le16 running_sum;
> +	__le16 gross_running_sum;
> +	u8 ipv6_frag_offset;
> +	u8 nxt_hdr_offset;
> +	u8 routing_hdr_offset_2;
> +	u8 routing_hdr_offset_1;
> +	/* 64-bit word 6 */
> +	u8 reserved[5]; /* Soft-parsing context */
> +	u8 ip_proto_offset_n;
> +	u8 nxt_hdr_frag_offset;
> +	u8 parse_error_code;
> +};
> +
> +#define DPAA2_FAPR_OFFSET		0x10
> +#define DPAA2_FAPR_SIZE			sizeof(struct dpaa2_fapr)
> +
>  /* Frame annotation egress action descriptor */
>  #define DPAA2_FAEAD_OFFSET		0x58
>  
> @@ -185,6 +228,11 @@ static inline __le64 *dpaa2_get_ts(void *buf_addr, bool swa)
>  	return dpaa2_get_hwa(buf_addr, swa) + DPAA2_TS_OFFSET;
>  }
>  
> +static inline struct dpaa2_fapr *dpaa2_get_fapr(void *buf_addr, bool swa)
> +{
> +	return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAPR_OFFSET;
> +}
> +
>  static inline struct dpaa2_faead *dpaa2_get_faead(void *buf_addr, bool swa)
>  {
>  	return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAEAD_OFFSET;
> @@ -236,6 +284,9 @@ static inline struct dpaa2_faead *dpaa2_get_faead(void *buf_addr, bool swa)
>  					 DPAA2_FAS_L3CE		| \
>  					 DPAA2_FAS_L4CE)
>  
> +/* TCP indication in Frame Annotation Parse Results */
> +#define DPAA2_FAF_HI_TCP_PRESENT	BIT(23)
> +
>  /* Time in milliseconds between link state updates */
>  #define DPAA2_ETH_LINK_STATE_REFRESH	1000
>
David Miller July 23, 2019, 9:02 p.m. UTC | #2
From: Ioana Radulescu <ruxandra.radulescu@nxp.com>
Date: Tue, 23 Jul 2019 20:28:43 +0300

> Using Rx skb bulking for all frames may negatively impact the
> performance in some TCP termination scenarios, as it effectively
> bypasses GRO.

"may"?

Please provide numbers so that we know exactly whether it actually
hurts performance one way or another.

Thank you.
Eric Dumazet July 24, 2019, 7:10 a.m. UTC | #3
On 7/23/19 7:28 PM, Ioana Radulescu wrote:
> Using Rx skb bulking for all frames may negatively impact the
> performance in some TCP termination scenarios, as it effectively
> bypasses GRO.

>  
> -	list_add_tail(&skb->list, ch->rx_list);
> +	if (frame_is_tcp(fd, fas))
> +		napi_gro_receive(&ch->napi, skb);
> +	else
> +		list_add_tail(&skb->list, ch->rx_list);
>  
>  	return;
>  


This is really bad.

This is exactly why I suggested adding the batching capability to GRO
itself, instead of having to change all drivers.

Edward Cree is working on this.
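
For illustration of that direction: once the GRO core batches GRO_NORMAL
skbs itself, the driver Rx hot path needs neither the TCP parse check nor
a per-channel skb list. The helper name below is made up for this sketch;
only napi_gro_receive() and the existing ch->napi field come from the
driver as posted.

/* Sketch: driver side once batching lives inside GRO */
static void dpaa2_eth_rx_deliver(struct dpaa2_eth_channel *ch,
				 struct sk_buff *skb)
{
	/* GRO coalesces what it can; anything it cannot coalesce would be
	 * batched by the core and flushed via netif_receive_skb_list()
	 * when the NAPI poll completes.
	 */
	napi_gro_receive(&ch->napi, skb);
}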
Ioana Radulescu July 24, 2019, 8:15 a.m. UTC | #4
> -----Original Message-----
> From: David Miller <davem@davemloft.net>
> Sent: Wednesday, July 24, 2019 12:03 AM
> To: Ioana Ciocoi Radulescu <ruxandra.radulescu@nxp.com>
> Cc: netdev@vger.kernel.org; Ioana Ciornei <ioana.ciornei@nxp.com>; Vladimir Oltean <vladimir.oltean@nxp.com>
> Subject: Re: [PATCH net-next] dpaa2-eth: Don't use netif_receive_skb_list for TCP frames
> 
> From: Ioana Radulescu <ruxandra.radulescu@nxp.com>
> Date: Tue, 23 Jul 2019 20:28:43 +0300
> 
> > Using Rx skb bulking for all frames may negatively impact the
> > performance in some TCP termination scenarios, as it effectively
> > bypasses GRO.
> 
> "may"?
> 
> Please provide numbers so that we know exactly whether it actually
> hurts performance one way or another.

We observed the worst degradation running netperf TCP_STREAM on a
setup with two 16-core LX2160A boards connected back-to-back, one
stream per CPU:
With netif_receive_skb_list() on all packets, we get a total bandwidth
of 41.6 Gbps at an Rx CPU load of 97%; after applying the current patch,
bandwidth increases to 45.8 Gbps with the Rx CPU at 64%, which is similar
to what we got before using netif_receive_skb_list() in the first place.

On other platforms/setups the impact is lower; in some cases there is
no difference in throughput, only a higher CPU load when skb batching
is used for TCP packets.

Anyway, based on feedback so far I guess the best path forward is to
withdraw this patch and wait for Edward's GRO batching work to be
accepted. I can help test those patches if needed.

Thanks,
Ioana

Patch

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 0acb115..412f87f 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -348,6 +348,16 @@  static u32 run_xdp(struct dpaa2_eth_priv *priv,
 	return xdp_act;
 }
 
+static bool frame_is_tcp(const struct dpaa2_fd *fd, struct dpaa2_fas *fas)
+{
+	struct dpaa2_fapr *fapr = dpaa2_get_fapr(fas, false);
+
+	if (!(dpaa2_fd_get_frc(fd) & DPAA2_FD_FRC_FAPRV))
+		return false;
+
+	return !!(fapr->faf_hi & DPAA2_FAF_HI_TCP_PRESENT);
+}
+
 /* Main Rx frame processing routine */
 static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
 			 struct dpaa2_eth_channel *ch,
@@ -435,7 +445,10 @@  static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
 	percpu_stats->rx_packets++;
 	percpu_stats->rx_bytes += dpaa2_fd_get_len(fd);
 
-	list_add_tail(&skb->list, ch->rx_list);
+	if (frame_is_tcp(fd, fas))
+		napi_gro_receive(&ch->napi, skb);
+	else
+		list_add_tail(&skb->list, ch->rx_list);
 
 	return;
 
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
index 9af18c2..d723ae7 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h
@@ -155,6 +155,49 @@  struct dpaa2_fas {
  */
 #define DPAA2_TS_OFFSET			0x8
 
+/* Frame annotation parse results */
+struct dpaa2_fapr {
+	/* 64-bit word 1 */
+	__le32 faf_lo;
+	__le16 faf_ext;
+	__le16 nxt_hdr;
+	/* 64-bit word 2 */
+	__le64 faf_hi;
+	/* 64-bit word 3 */
+	u8 last_ethertype_offset;
+	u8 vlan_tci_offset_n;
+	u8 vlan_tci_offset_1;
+	u8 llc_snap_offset;
+	u8 eth_offset;
+	u8 ip1_pid_offset;
+	u8 shim_offset_2;
+	u8 shim_offset_1;
+	/* 64-bit word 4 */
+	u8 l5_offset;
+	u8 l4_offset;
+	u8 gre_offset;
+	u8 l3_offset_n;
+	u8 l3_offset_1;
+	u8 mpls_offset_n;
+	u8 mpls_offset_1;
+	u8 pppoe_offset;
+	/* 64-bit word 5 */
+	__le16 running_sum;
+	__le16 gross_running_sum;
+	u8 ipv6_frag_offset;
+	u8 nxt_hdr_offset;
+	u8 routing_hdr_offset_2;
+	u8 routing_hdr_offset_1;
+	/* 64-bit word 6 */
+	u8 reserved[5]; /* Soft-parsing context */
+	u8 ip_proto_offset_n;
+	u8 nxt_hdr_frag_offset;
+	u8 parse_error_code;
+};
+
+#define DPAA2_FAPR_OFFSET		0x10
+#define DPAA2_FAPR_SIZE			sizeof(struct dpaa2_fapr)
+
 /* Frame annotation egress action descriptor */
 #define DPAA2_FAEAD_OFFSET		0x58
 
@@ -185,6 +228,11 @@  static inline __le64 *dpaa2_get_ts(void *buf_addr, bool swa)
 	return dpaa2_get_hwa(buf_addr, swa) + DPAA2_TS_OFFSET;
 }
 
+static inline struct dpaa2_fapr *dpaa2_get_fapr(void *buf_addr, bool swa)
+{
+	return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAPR_OFFSET;
+}
+
 static inline struct dpaa2_faead *dpaa2_get_faead(void *buf_addr, bool swa)
 {
 	return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAEAD_OFFSET;
@@ -236,6 +284,9 @@  static inline struct dpaa2_faead *dpaa2_get_faead(void *buf_addr, bool swa)
 					 DPAA2_FAS_L3CE		| \
 					 DPAA2_FAS_L4CE)
 
+/* TCP indication in Frame Annotation Parse Results */
+#define DPAA2_FAF_HI_TCP_PRESENT	BIT(23)
+
 /* Time in milliseconds between link state updates */
 #define DPAA2_ETH_LINK_STATE_REFRESH	1000