Message ID | 1563902923-26178-1-git-send-email-ruxandra.radulescu@nxp.com |
---|---|
State | Changes Requested |
Delegated to: | David Miller |
Headers | show |
Series | [net-next] dpaa2-eth: Don't use netif_receive_skb_list for TCP frames | expand |
On Tue, 2019-07-23 at 20:28 +0300, Ioana Radulescu wrote: > Using Rx skb bulking for all frames may negatively impact the > performance in some TCP termination scenarios, as it effectively > bypasses GRO. > > Look at the hardware parse results of each ingress frame to see > if a TCP header is present or not; for TCP frames fall back to > the old implementation. > > Signed-off-by: Ioana Radulescu <ruxandra.radulescu@nxp.com> > Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> > --- > drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 15 ++++++- > drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h | 51 > ++++++++++++++++++++++++ > 2 files changed, 65 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c > b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c > index 0acb115..412f87f 100644 > --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c > +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c > @@ -348,6 +348,16 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, > return xdp_act; > } > > +static bool frame_is_tcp(const struct dpaa2_fd *fd, struct dpaa2_fas > *fas) > +{ > + struct dpaa2_fapr *fapr = dpaa2_get_fapr(fas, false); > + > + if (!(dpaa2_fd_get_frc(fd) & DPAA2_FD_FRC_FAPRV)) > + return false; > + > + return !!(fapr->faf_hi & DPAA2_FAF_HI_TCP_PRESENT); > +} > + > /* Main Rx frame processing routine */ > static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, > struct dpaa2_eth_channel *ch, > @@ -435,7 +445,10 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv > *priv, > percpu_stats->rx_packets++; > percpu_stats->rx_bytes += dpaa2_fd_get_len(fd); > > - list_add_tail(&skb->list, ch->rx_list); > + if (frame_is_tcp(fd, fas)) > + napi_gro_receive(&ch->napi, skb); > + else > + list_add_tail(&skb->list, ch->rx_list); > Maybe it is better if we introduce napi_gro_receive_list() and let napi decide when to do napi_gro_receive or list_add_tail, gro stack already knows how to make the decision, (no need to have a 
specialized hw parser for TCP frames) so all drivers will benefit from this and we will not repeat the same thing you did here in all drivers. also in such case napi/napi_gro will maintain the temporary rx skb list, which seems more natural than implementing it per driver. On napi_gro_receive_list: 1) Try gro 2) if the result is GRO_NORMAL, then add the skb to list 3) on napi_complete or when list budget is consumed flush skb list by calling netif_receive_skb_list. > return; > > diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h > b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h > index 9af18c2..d723ae7 100644 > --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h > +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h > @@ -155,6 +155,49 @@ struct dpaa2_fas { > */ > #define DPAA2_TS_OFFSET 0x8 > > +/* Frame annotation parse results */ > +struct dpaa2_fapr { > + /* 64-bit word 1 */ > + __le32 faf_lo; > + __le16 faf_ext; > + __le16 nxt_hdr; > + /* 64-bit word 2 */ > + __le64 faf_hi; > + /* 64-bit word 3 */ > + u8 last_ethertype_offset; > + u8 vlan_tci_offset_n; > + u8 vlan_tci_offset_1; > + u8 llc_snap_offset; > + u8 eth_offset; > + u8 ip1_pid_offset; > + u8 shim_offset_2; > + u8 shim_offset_1; > + /* 64-bit word 4 */ > + u8 l5_offset; > + u8 l4_offset; > + u8 gre_offset; > + u8 l3_offset_n; > + u8 l3_offset_1; > + u8 mpls_offset_n; > + u8 mpls_offset_1; > + u8 pppoe_offset; > + /* 64-bit word 5 */ > + __le16 running_sum; > + __le16 gross_running_sum; > + u8 ipv6_frag_offset; > + u8 nxt_hdr_offset; > + u8 routing_hdr_offset_2; > + u8 routing_hdr_offset_1; > + /* 64-bit word 6 */ > + u8 reserved[5]; /* Soft-parsing context */ > + u8 ip_proto_offset_n; > + u8 nxt_hdr_frag_offset; > + u8 parse_error_code; > +}; > + > +#define DPAA2_FAPR_OFFSET 0x10 > +#define DPAA2_FAPR_SIZE sizeof((struct > dpaa2_fapr)) > + > /* Frame annotation egress action descriptor */ > #define DPAA2_FAEAD_OFFSET 0x58 > > @@ -185,6 +228,11 @@ static inline __le64 
*dpaa2_get_ts(void > *buf_addr, bool swa) > return dpaa2_get_hwa(buf_addr, swa) + DPAA2_TS_OFFSET; > } > > +static inline struct dpaa2_fapr *dpaa2_get_fapr(void *buf_addr, bool > swa) > +{ > + return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAPR_OFFSET; > +} > + > static inline struct dpaa2_faead *dpaa2_get_faead(void *buf_addr, > bool swa) > { > return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAEAD_OFFSET; > @@ -236,6 +284,9 @@ static inline struct dpaa2_faead > *dpaa2_get_faead(void *buf_addr, bool swa) > DPAA2_FAS_L3CE | \ > DPAA2_FAS_L4CE) > > +/* TCP indication in Frame Annotation Parse Results */ > +#define DPAA2_FAF_HI_TCP_PRESENT BIT(23) > + > /* Time in milliseconds between link state updates */ > #define DPAA2_ETH_LINK_STATE_REFRESH 1000 >
From: Ioana Radulescu <ruxandra.radulescu@nxp.com> Date: Tue, 23 Jul 2019 20:28:43 +0300 > Using Rx skb bulking for all frames may negatively impact the > performance in some TCP termination scenarios, as it effectively > bypasses GRO. "may"? Please provide numbers so that we know exactly whether it actually hurts performance one way or another. Thank you.
On 7/23/19 7:28 PM, Ioana Radulescu wrote: > Using Rx skb bulking for all frames may negatively impact the > performance in some TCP termination scenarios, as it effectively > bypasses GRO. > > - list_add_tail(&skb->list, ch->rx_list); > + if (frame_is_tcp(fd, fas)) > + napi_gro_receive(&ch->napi, skb); > + else > + list_add_tail(&skb->list, ch->rx_list); > > return; > This is really bad. This is exactly why I suggested to add the batching capability to GRO, instead of having to change all drivers. Edward Cree is working on this.
> -----Original Message----- > From: David Miller <davem@davemloft.net> > Sent: Wednesday, July 24, 2019 12:03 AM > To: Ioana Ciocoi Radulescu <ruxandra.radulescu@nxp.com> > Cc: netdev@vger.kernel.org; Ioana Ciornei <ioana.ciornei@nxp.com>; Vladimir > Oltean <vladimir.oltean@nxp.com> > Subject: Re: [PATCH net-next] dpaa2-eth: Don't use netif_receive_skb_list for > TCP frames > > From: Ioana Radulescu <ruxandra.radulescu@nxp.com> > Date: Tue, 23 Jul 2019 20:28:43 +0300 > > > Using Rx skb bulking for all frames may negatively impact the > > performance in some TCP termination scenarios, as it effectively > > bypasses GRO. > > "may"? > > Please provide numbers so that we know exactly whether it actually > hurts performance one way or another. We observed the worst degradation running netperf TCP_STREAM on a setup with two 16-core LX2160A boards connected back-to-back, one stream per cpu: With netif_receive_skb_list() on all packets, we get a total bandwidth of 41.6Gbps, with rx cpu load of 97%; after applying current patch, bw increases to 45.8Gbps with rx cpu at 64%, which is similar to what we got before using netif_receive_skb_list() in the first place. On other platforms/setups the impact is lower, in some cases there is no difference in throughput, only higher cpu load when skb batching is used for TCP packets. Anyway, based on feedback so far I guess the best path forward is to withdraw this patch and wait for Edward's GRO batching work to be accepted. I can help test those patches if needed. Thanks, Ioana
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0acb115..412f87f 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -348,6 +348,16 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, return xdp_act; } +static bool frame_is_tcp(const struct dpaa2_fd *fd, struct dpaa2_fas *fas) +{ + struct dpaa2_fapr *fapr = dpaa2_get_fapr(fas, false); + + if (!(dpaa2_fd_get_frc(fd) & DPAA2_FD_FRC_FAPRV)) + return false; + + return !!(fapr->faf_hi & DPAA2_FAF_HI_TCP_PRESENT); +} + /* Main Rx frame processing routine */ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, struct dpaa2_eth_channel *ch, @@ -435,7 +445,10 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, percpu_stats->rx_packets++; percpu_stats->rx_bytes += dpaa2_fd_get_len(fd); - list_add_tail(&skb->list, ch->rx_list); + if (frame_is_tcp(fd, fas)) + napi_gro_receive(&ch->napi, skb); + else + list_add_tail(&skb->list, ch->rx_list); return; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 9af18c2..d723ae7 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -155,6 +155,49 @@ struct dpaa2_fas { */ #define DPAA2_TS_OFFSET 0x8 +/* Frame annotation parse results */ +struct dpaa2_fapr { + /* 64-bit word 1 */ + __le32 faf_lo; + __le16 faf_ext; + __le16 nxt_hdr; + /* 64-bit word 2 */ + __le64 faf_hi; + /* 64-bit word 3 */ + u8 last_ethertype_offset; + u8 vlan_tci_offset_n; + u8 vlan_tci_offset_1; + u8 llc_snap_offset; + u8 eth_offset; + u8 ip1_pid_offset; + u8 shim_offset_2; + u8 shim_offset_1; + /* 64-bit word 4 */ + u8 l5_offset; + u8 l4_offset; + u8 gre_offset; + u8 l3_offset_n; + u8 l3_offset_1; + u8 mpls_offset_n; + u8 mpls_offset_1; + u8 pppoe_offset; + /* 64-bit word 5 */ + __le16 running_sum; + __le16 gross_running_sum; + u8 ipv6_frag_offset; 
+ u8 nxt_hdr_offset; + u8 routing_hdr_offset_2; + u8 routing_hdr_offset_1; + /* 64-bit word 6 */ + u8 reserved[5]; /* Soft-parsing context */ + u8 ip_proto_offset_n; + u8 nxt_hdr_frag_offset; + u8 parse_error_code; +}; + +#define DPAA2_FAPR_OFFSET 0x10 +#define DPAA2_FAPR_SIZE sizeof((struct dpaa2_fapr)) + /* Frame annotation egress action descriptor */ #define DPAA2_FAEAD_OFFSET 0x58 @@ -185,6 +228,11 @@ static inline __le64 *dpaa2_get_ts(void *buf_addr, bool swa) return dpaa2_get_hwa(buf_addr, swa) + DPAA2_TS_OFFSET; } +static inline struct dpaa2_fapr *dpaa2_get_fapr(void *buf_addr, bool swa) +{ + return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAPR_OFFSET; +} + static inline struct dpaa2_faead *dpaa2_get_faead(void *buf_addr, bool swa) { return dpaa2_get_hwa(buf_addr, swa) + DPAA2_FAEAD_OFFSET; @@ -236,6 +284,9 @@ static inline struct dpaa2_faead *dpaa2_get_faead(void *buf_addr, bool swa) DPAA2_FAS_L3CE | \ DPAA2_FAS_L4CE) +/* TCP indication in Frame Annotation Parse Results */ +#define DPAA2_FAF_HI_TCP_PRESENT BIT(23) + /* Time in milliseconds between link state updates */ #define DPAA2_ETH_LINK_STATE_REFRESH 1000